chiark / gitweb /
a81bb8ea4175077856f0aa7085c2f0bf688ccf68
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46
47 #ifdef HAVE_SELINUX
48 #include <selinux/selinux.h>
49 #endif
50
51 #ifdef HAVE_SECCOMP
52 #include <seccomp.h>
53 #endif
54
55 #include "sd-daemon.h"
56 #include "sd-bus.h"
57 #include "sd-id128.h"
58 #include "sd-rtnl.h"
59 #include "log.h"
60 #include "util.h"
61 #include "mkdir.h"
62 #include "macro.h"
63 #include "audit.h"
64 #include "missing.h"
65 #include "cgroup-util.h"
66 #include "strv.h"
67 #include "path-util.h"
68 #include "loopback-setup.h"
69 #include "dev-setup.h"
70 #include "fdset.h"
71 #include "build.h"
72 #include "fileio.h"
73 #include "bus-util.h"
74 #include "bus-error.h"
75 #include "ptyfwd.h"
76 #include "bus-kernel.h"
77 #include "env-util.h"
78 #include "def.h"
79 #include "rtnl-util.h"
80 #include "udev-util.h"
81
/* Journal linking modes. The value is chosen on the command line with
 * --link-journal=no|auto|host|guest (or -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
88
89 static char *arg_directory = NULL;
90 static char *arg_user = NULL;
91 static sd_id128_t arg_uuid = {};
92 static char *arg_machine = NULL;
93 static char *arg_selinux_context = NULL;
94 static char *arg_selinux_apifs_context = NULL;
95 static const char *arg_slice = NULL;
96 static bool arg_private_network = false;
97 static bool arg_read_only = false;
98 static bool arg_boot = false;
99 static LinkJournal arg_link_journal = LINK_AUTO;
100 static uint64_t arg_retain =
101         (1ULL << CAP_CHOWN) |
102         (1ULL << CAP_DAC_OVERRIDE) |
103         (1ULL << CAP_DAC_READ_SEARCH) |
104         (1ULL << CAP_FOWNER) |
105         (1ULL << CAP_FSETID) |
106         (1ULL << CAP_IPC_OWNER) |
107         (1ULL << CAP_KILL) |
108         (1ULL << CAP_LEASE) |
109         (1ULL << CAP_LINUX_IMMUTABLE) |
110         (1ULL << CAP_NET_BIND_SERVICE) |
111         (1ULL << CAP_NET_BROADCAST) |
112         (1ULL << CAP_NET_RAW) |
113         (1ULL << CAP_SETGID) |
114         (1ULL << CAP_SETFCAP) |
115         (1ULL << CAP_SETPCAP) |
116         (1ULL << CAP_SETUID) |
117         (1ULL << CAP_SYS_ADMIN) |
118         (1ULL << CAP_SYS_CHROOT) |
119         (1ULL << CAP_SYS_NICE) |
120         (1ULL << CAP_SYS_PTRACE) |
121         (1ULL << CAP_SYS_TTY_CONFIG) |
122         (1ULL << CAP_SYS_RESOURCE) |
123         (1ULL << CAP_SYS_BOOT) |
124         (1ULL << CAP_AUDIT_WRITE) |
125         (1ULL << CAP_AUDIT_CONTROL) |
126         (1ULL << CAP_MKNOD);
127 static char **arg_bind = NULL;
128 static char **arg_bind_ro = NULL;
129 static char **arg_setenv = NULL;
130 static bool arg_quiet = false;
131 static bool arg_share_system = false;
132 static bool arg_register = true;
133 static bool arg_keep_unit = false;
134 static char **arg_network_interfaces = NULL;
135 static bool arg_network_veth = false;
136
137 static int help(void) {
138
139         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
140                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
141                "  -h --help                 Show this help\n"
142                "     --version              Print version string\n"
143                "  -q --quiet                Do not show status information\n"
144                "  -D --directory=NAME       Root directory for the container\n"
145                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
146                "  -u --user=USER            Run the command under specified user or uid\n"
147                "  -M --machine=NAME         Set the machine name for the container\n"
148                "     --uuid=UUID            Set a specific machine UUID for the container\n"
149                "  -S --slice=SLICE          Place the container in the specified slice\n"
150                "     --private-network      Disable network in container\n"
151                "     --network-interface=INTERFACE\n"
152                "                            Assign an existing network interface to the\n"
153                "                            container\n"
154                "     --network-veth         Add a a virtual ethernet connection between host\n"
155                "                            and container\n"
156                "  -Z --selinux-context=SECLABEL\n"
157                "                            Set the SELinux security context to be used by\n"
158                "                            processes in the container\n"
159                "  -L --selinux-apifs-context=SECLABEL\n"
160                "                            Set the SELinux security context to be used by\n"
161                "                            API/tmpfs file systems in the container\n"
162                "     --capability=CAP       In addition to the default, retain specified\n"
163                "                            capability\n"
164                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
165                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
166                "  -j                        Equivalent to --link-journal=host\n"
167                "     --read-only            Mount the root directory read-only\n"
168                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
169                "                            the container\n"
170                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
171                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
172                "     --share-system         Share system namespaces with host\n"
173                "     --register=BOOLEAN     Register container as machine\n"
174                "     --keep-unit            Do not register a scope for the machine, reuse\n"
175                "                            the service unit nspawn is running in\n",
176                program_invocation_short_name);
177
178         return 0;
179 }
180
181 static int parse_argv(int argc, char *argv[]) {
182
183         enum {
184                 ARG_VERSION = 0x100,
185                 ARG_PRIVATE_NETWORK,
186                 ARG_UUID,
187                 ARG_READ_ONLY,
188                 ARG_CAPABILITY,
189                 ARG_DROP_CAPABILITY,
190                 ARG_LINK_JOURNAL,
191                 ARG_BIND,
192                 ARG_BIND_RO,
193                 ARG_SETENV,
194                 ARG_SHARE_SYSTEM,
195                 ARG_REGISTER,
196                 ARG_KEEP_UNIT,
197                 ARG_NETWORK_INTERFACE,
198                 ARG_NETWORK_VETH,
199         };
200
201         static const struct option options[] = {
202                 { "help",                  no_argument,       NULL, 'h'                   },
203                 { "version",               no_argument,       NULL, ARG_VERSION           },
204                 { "directory",             required_argument, NULL, 'D'                   },
205                 { "user",                  required_argument, NULL, 'u'                   },
206                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
207                 { "boot",                  no_argument,       NULL, 'b'                   },
208                 { "uuid",                  required_argument, NULL, ARG_UUID              },
209                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
210                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
211                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
212                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
213                 { "bind",                  required_argument, NULL, ARG_BIND              },
214                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
215                 { "machine",               required_argument, NULL, 'M'                   },
216                 { "slice",                 required_argument, NULL, 'S'                   },
217                 { "setenv",                required_argument, NULL, ARG_SETENV            },
218                 { "selinux-context",       required_argument, NULL, 'Z'                   },
219                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
220                 { "quiet",                 no_argument,       NULL, 'q'                   },
221                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
222                 { "register",              required_argument, NULL, ARG_REGISTER          },
223                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
224                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
225                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH   },
226                 {}
227         };
228
229         int c, r;
230         uint64_t plus = 0, minus = 0;
231
232         assert(argc >= 0);
233         assert(argv);
234
235         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
236
237                 switch (c) {
238
239                 case 'h':
240                         return help();
241
242                 case ARG_VERSION:
243                         puts(PACKAGE_STRING);
244                         puts(SYSTEMD_FEATURES);
245                         return 0;
246
247                 case 'D':
248                         free(arg_directory);
249                         arg_directory = canonicalize_file_name(optarg);
250                         if (!arg_directory) {
251                                 log_error("Invalid root directory: %m");
252                                 return -ENOMEM;
253                         }
254
255                         break;
256
257                 case 'u':
258                         free(arg_user);
259                         arg_user = strdup(optarg);
260                         if (!arg_user)
261                                 return log_oom();
262
263                         break;
264
265                 case ARG_NETWORK_VETH:
266                         arg_network_veth = true;
267                         arg_private_network = true;
268                         break;
269
270                 case ARG_NETWORK_INTERFACE:
271                         if (strv_push(&arg_network_interfaces, optarg) < 0)
272                                 return log_oom();
273
274                         /* fall through */
275
276                 case ARG_PRIVATE_NETWORK:
277                         arg_private_network = true;
278                         break;
279
280                 case 'b':
281                         arg_boot = true;
282                         break;
283
284                 case ARG_UUID:
285                         r = sd_id128_from_string(optarg, &arg_uuid);
286                         if (r < 0) {
287                                 log_error("Invalid UUID: %s", optarg);
288                                 return r;
289                         }
290                         break;
291
292                 case 'S':
293                         arg_slice = strdup(optarg);
294                         if (!arg_slice)
295                                 return log_oom();
296
297                         break;
298
299                 case 'M':
300                         if (isempty(optarg)) {
301                                 free(arg_machine);
302                                 arg_machine = NULL;
303                         } else {
304
305                                 if (!hostname_is_valid(optarg)) {
306                                         log_error("Invalid machine name: %s", optarg);
307                                         return -EINVAL;
308                                 }
309
310                                 free(arg_machine);
311                                 arg_machine = strdup(optarg);
312                                 if (!arg_machine)
313                                         return log_oom();
314
315                                 break;
316                         }
317
318                 case 'Z':
319                         arg_selinux_context = optarg;
320                         break;
321
322                 case 'L':
323                         arg_selinux_apifs_context = optarg;
324                         break;
325
326                 case ARG_READ_ONLY:
327                         arg_read_only = true;
328                         break;
329
330                 case ARG_CAPABILITY:
331                 case ARG_DROP_CAPABILITY: {
332                         char *state, *word;
333                         size_t length;
334
335                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
336                                 _cleanup_free_ char *t;
337                                 cap_value_t cap;
338
339                                 t = strndup(word, length);
340                                 if (!t)
341                                         return log_oom();
342
343                                 if (streq(t, "all")) {
344                                         if (c == ARG_CAPABILITY)
345                                                 plus = (uint64_t) -1;
346                                         else
347                                                 minus = (uint64_t) -1;
348                                 } else {
349                                         if (cap_from_name(t, &cap) < 0) {
350                                                 log_error("Failed to parse capability %s.", t);
351                                                 return -EINVAL;
352                                         }
353
354                                         if (c == ARG_CAPABILITY)
355                                                 plus |= 1ULL << (uint64_t) cap;
356                                         else
357                                                 minus |= 1ULL << (uint64_t) cap;
358                                 }
359                         }
360
361                         break;
362                 }
363
364                 case 'j':
365                         arg_link_journal = LINK_GUEST;
366                         break;
367
368                 case ARG_LINK_JOURNAL:
369                         if (streq(optarg, "auto"))
370                                 arg_link_journal = LINK_AUTO;
371                         else if (streq(optarg, "no"))
372                                 arg_link_journal = LINK_NO;
373                         else if (streq(optarg, "guest"))
374                                 arg_link_journal = LINK_GUEST;
375                         else if (streq(optarg, "host"))
376                                 arg_link_journal = LINK_HOST;
377                         else {
378                                 log_error("Failed to parse link journal mode %s", optarg);
379                                 return -EINVAL;
380                         }
381
382                         break;
383
384                 case ARG_BIND:
385                 case ARG_BIND_RO: {
386                         _cleanup_free_ char *a = NULL, *b = NULL;
387                         char *e;
388                         char ***x;
389
390                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
391
392                         e = strchr(optarg, ':');
393                         if (e) {
394                                 a = strndup(optarg, e - optarg);
395                                 b = strdup(e + 1);
396                         } else {
397                                 a = strdup(optarg);
398                                 b = strdup(optarg);
399                         }
400
401                         if (!a || !b)
402                                 return log_oom();
403
404                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
405                                 log_error("Invalid bind mount specification: %s", optarg);
406                                 return -EINVAL;
407                         }
408
409                         r = strv_extend(x, a);
410                         if (r < 0)
411                                 return log_oom();
412
413                         r = strv_extend(x, b);
414                         if (r < 0)
415                                 return log_oom();
416
417                         break;
418                 }
419
420                 case ARG_SETENV: {
421                         char **n;
422
423                         if (!env_assignment_is_valid(optarg)) {
424                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
425                                 return -EINVAL;
426                         }
427
428                         n = strv_env_set(arg_setenv, optarg);
429                         if (!n)
430                                 return log_oom();
431
432                         strv_free(arg_setenv);
433                         arg_setenv = n;
434                         break;
435                 }
436
437                 case 'q':
438                         arg_quiet = true;
439                         break;
440
441                 case ARG_SHARE_SYSTEM:
442                         arg_share_system = true;
443                         break;
444
445                 case ARG_REGISTER:
446                         r = parse_boolean(optarg);
447                         if (r < 0) {
448                                 log_error("Failed to parse --register= argument: %s", optarg);
449                                 return r;
450                         }
451
452                         arg_register = r;
453                         break;
454
455                 case ARG_KEEP_UNIT:
456                         arg_keep_unit = true;
457                         break;
458
459                 case '?':
460                         return -EINVAL;
461
462                 default:
463                         assert_not_reached("Unhandled option");
464                 }
465         }
466
467         if (arg_share_system)
468                 arg_register = false;
469
470         if (arg_boot && arg_share_system) {
471                 log_error("--boot and --share-system may not be combined.");
472                 return -EINVAL;
473         }
474
475         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
476                 log_error("--keep-unit may not be used when invoked from a user session.");
477                 return -EINVAL;
478         }
479
480         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
481
482         return 1;
483 }
484
485 static int mount_all(const char *dest) {
486
487         typedef struct MountPoint {
488                 const char *what;
489                 const char *where;
490                 const char *type;
491                 const char *options;
492                 unsigned long flags;
493                 bool fatal;
494         } MountPoint;
495
496         static const MountPoint mount_table[] = {
497                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
498                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
499                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
500                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
501                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
502                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
503                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
504                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
505 #ifdef HAVE_SELINUX
506                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
507                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
508 #endif
509         };
510
511         unsigned k;
512         int r = 0;
513
514         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
515                 _cleanup_free_ char *where = NULL;
516 #ifdef HAVE_SELINUX
517                 _cleanup_free_ char *options = NULL;
518 #endif
519                 const char *o;
520                 int t;
521
522                 where = strjoin(dest, "/", mount_table[k].where, NULL);
523                 if (!where)
524                         return log_oom();
525
526                 t = path_is_mount_point(where, true);
527                 if (t < 0) {
528                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
529
530                         if (r == 0)
531                                 r = t;
532
533                         continue;
534                 }
535
536                 /* Skip this entry if it is not a remount. */
537                 if (mount_table[k].what && t > 0)
538                         continue;
539
540                 mkdir_p(where, 0755);
541
542 #ifdef HAVE_SELINUX
543                 if (arg_selinux_apifs_context &&
544                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
545                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
546                         if (!options)
547                                 return log_oom();
548
549                         o = options;
550                 } else
551 #endif
552                         o = mount_table[k].options;
553
554
555                 if (mount(mount_table[k].what,
556                           where,
557                           mount_table[k].type,
558                           mount_table[k].flags,
559                           o) < 0 &&
560                     mount_table[k].fatal) {
561
562                         log_error("mount(%s) failed: %m", where);
563
564                         if (r == 0)
565                                 r = -errno;
566                 }
567         }
568
569         return r;
570 }
571
572 static int mount_binds(const char *dest, char **l, unsigned long flags) {
573         char **x, **y;
574
575         STRV_FOREACH_PAIR(x, y, l) {
576                 char *where;
577                 struct stat source_st, dest_st;
578                 int r;
579
580                 if (stat(*x, &source_st) < 0) {
581                         log_error("failed to stat %s: %m", *x);
582                         return -errno;
583                 }
584
585                 where = strappenda(dest, *y);
586                 r = stat(where, &dest_st);
587                 if (r == 0) {
588                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
589                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
590                                                 *x, where);
591                                 return -EINVAL;
592                         }
593                 } else if (errno == ENOENT) {
594                         r = mkdir_parents_label(where, 0755);
595                         if (r < 0) {
596                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
597                                 return r;
598                         }
599                 } else {
600                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
601                         return -errno;
602                 }
603                 /* Create the mount point, but be conservative -- refuse to create block
604                 * and char devices. */
605                 if (S_ISDIR(source_st.st_mode))
606                         mkdir_label(where, 0755);
607                 else if (S_ISFIFO(source_st.st_mode))
608                         mkfifo(where, 0644);
609                 else if (S_ISSOCK(source_st.st_mode))
610                         mknod(where, 0644 | S_IFSOCK, 0);
611                 else if (S_ISREG(source_st.st_mode))
612                         touch(where);
613                 else {
614                         log_error("Refusing to create mountpoint for file: %s", *x);
615                         return -ENOTSUP;
616                 }
617
618                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
619                         log_error("mount(%s) failed: %m", where);
620                         return -errno;
621                 }
622
623                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
624                         log_error("mount(%s) failed: %m", where);
625                         return -errno;
626                 }
627         }
628
629         return 0;
630 }
631
632 static int setup_timezone(const char *dest) {
633         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
634         char *z, *y;
635         int r;
636
637         assert(dest);
638
639         /* Fix the timezone, if possible */
640         r = readlink_malloc("/etc/localtime", &p);
641         if (r < 0) {
642                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
643                 return 0;
644         }
645
646         z = path_startswith(p, "../usr/share/zoneinfo/");
647         if (!z)
648                 z = path_startswith(p, "/usr/share/zoneinfo/");
649         if (!z) {
650                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
651                 return 0;
652         }
653
654         where = strappend(dest, "/etc/localtime");
655         if (!where)
656                 return log_oom();
657
658         r = readlink_malloc(where, &q);
659         if (r >= 0) {
660                 y = path_startswith(q, "../usr/share/zoneinfo/");
661                 if (!y)
662                         y = path_startswith(q, "/usr/share/zoneinfo/");
663
664
665                 /* Already pointing to the right place? Then do nothing .. */
666                 if (y && streq(y, z))
667                         return 0;
668         }
669
670         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
671         if (!check)
672                 return log_oom();
673
674         if (access(check, F_OK) < 0) {
675                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
676                 return 0;
677         }
678
679         what = strappend("../usr/share/zoneinfo/", z);
680         if (!what)
681                 return log_oom();
682
683         unlink(where);
684         if (symlink(what, where) < 0) {
685                 log_error("Failed to correct timezone of container: %m");
686                 return 0;
687         }
688
689         return 0;
690 }
691
692 static int setup_resolv_conf(const char *dest) {
693         char _cleanup_free_ *where = NULL;
694
695         assert(dest);
696
697         if (arg_private_network)
698                 return 0;
699
700         /* Fix resolv.conf, if possible */
701         where = strappend(dest, "/etc/resolv.conf");
702         if (!where)
703                 return log_oom();
704
705         /* We don't really care for the results of this really. If it
706          * fails, it fails, but meh... */
707         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
708
709         return 0;
710 }
711
712 static int setup_boot_id(const char *dest) {
713         _cleanup_free_ char *from = NULL, *to = NULL;
714         sd_id128_t rnd;
715         char as_uuid[37];
716         int r;
717
718         assert(dest);
719
720         if (arg_share_system)
721                 return 0;
722
723         /* Generate a new randomized boot ID, so that each boot-up of
724          * the container gets a new one */
725
726         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
727         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
728         if (!from || !to)
729                 return log_oom();
730
731         r = sd_id128_randomize(&rnd);
732         if (r < 0) {
733                 log_error("Failed to generate random boot id: %s", strerror(-r));
734                 return r;
735         }
736
737         snprintf(as_uuid, sizeof(as_uuid),
738                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
739                  SD_ID128_FORMAT_VAL(rnd));
740         char_array_0(as_uuid);
741
742         r = write_string_file(from, as_uuid);
743         if (r < 0) {
744                 log_error("Failed to write boot id: %s", strerror(-r));
745                 return r;
746         }
747
748         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
749                 log_error("Failed to bind mount boot id: %m");
750                 r = -errno;
751         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
752                 log_warning("Failed to make boot id read-only: %m");
753
754         unlink(from);
755         return r;
756 }
757
758 static int copy_devnodes(const char *dest) {
759
760         static const char devnodes[] =
761                 "null\0"
762                 "zero\0"
763                 "full\0"
764                 "random\0"
765                 "urandom\0"
766                 "tty\0";
767
768         const char *d;
769         int r = 0;
770         _cleanup_umask_ mode_t u;
771
772         assert(dest);
773
774         u = umask(0000);
775
776         NULSTR_FOREACH(d, devnodes) {
777                 _cleanup_free_ char *from = NULL, *to = NULL;
778                 struct stat st;
779
780                 from = strappend("/dev/", d);
781                 to = strjoin(dest, "/dev/", d, NULL);
782                 if (!from || !to)
783                         return log_oom();
784
785                 if (stat(from, &st) < 0) {
786
787                         if (errno != ENOENT) {
788                                 log_error("Failed to stat %s: %m", from);
789                                 return -errno;
790                         }
791
792                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
793
794                         log_error("%s is not a char or block device, cannot copy", from);
795                         return -EIO;
796
797                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
798
799                         log_error("mknod(%s) failed: %m", dest);
800                         return  -errno;
801                 }
802         }
803
804         return r;
805 }
806
807 static int setup_ptmx(const char *dest) {
808         _cleanup_free_ char *p = NULL;
809
810         p = strappend(dest, "/dev/ptmx");
811         if (!p)
812                 return log_oom();
813
814         if (symlink("pts/ptmx", p) < 0) {
815                 log_error("Failed to create /dev/ptmx symlink: %m");
816                 return -errno;
817         }
818
819         return 0;
820 }
821
822 static int setup_dev_console(const char *dest, const char *console) {
823         struct stat st;
824         _cleanup_free_ char *to = NULL;
825         int r;
826         _cleanup_umask_ mode_t u;
827
828         assert(dest);
829         assert(console);
830
831         u = umask(0000);
832
833         if (stat(console, &st) < 0) {
834                 log_error("Failed to stat %s: %m", console);
835                 return -errno;
836
837         } else if (!S_ISCHR(st.st_mode)) {
838                 log_error("/dev/console is not a char device");
839                 return -EIO;
840         }
841
842         r = chmod_and_chown(console, 0600, 0, 0);
843         if (r < 0) {
844                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
845                 return r;
846         }
847
848         if (asprintf(&to, "%s/dev/console", dest) < 0)
849                 return log_oom();
850
851         /* We need to bind mount the right tty to /dev/console since
852          * ptys can only exist on pts file systems. To have something
853          * to bind mount things on we create a device node first, that
854          * has the right major/minor (note that the major minor
855          * doesn't actually matter here, since we mount it over
856          * anyway). */
857
858         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
859                 log_error("mknod() for /dev/console failed: %m");
860                 return -errno;
861         }
862
863         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
864                 log_error("Bind mount for /dev/console failed: %m");
865                 return -errno;
866         }
867
868         return 0;
869 }
870
871 static int setup_kmsg(const char *dest, int kmsg_socket) {
872         _cleanup_free_ char *from = NULL, *to = NULL;
873         int r, fd, k;
874         _cleanup_umask_ mode_t u;
875         union {
876                 struct cmsghdr cmsghdr;
877                 uint8_t buf[CMSG_SPACE(sizeof(int))];
878         } control = {};
879         struct msghdr mh = {
880                 .msg_control = &control,
881                 .msg_controllen = sizeof(control),
882         };
883         struct cmsghdr *cmsg;
884
885         assert(dest);
886         assert(kmsg_socket >= 0);
887
888         u = umask(0000);
889
890         /* We create the kmsg FIFO as /dev/kmsg, but immediately
891          * delete it after bind mounting it to /proc/kmsg. While FIFOs
892          * on the reading side behave very similar to /proc/kmsg,
893          * their writing side behaves differently from /dev/kmsg in
894          * that writing blocks when nothing is reading. In order to
895          * avoid any problems with containers deadlocking due to this
896          * we simply make /dev/kmsg unavailable to the container. */
897         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
898             asprintf(&to, "%s/proc/kmsg", dest) < 0)
899                 return log_oom();
900
901         if (mkfifo(from, 0600) < 0) {
902                 log_error("mkfifo() for /dev/kmsg failed: %m");
903                 return -errno;
904         }
905
906         r = chmod_and_chown(from, 0600, 0, 0);
907         if (r < 0) {
908                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
909                 return r;
910         }
911
912         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
913                 log_error("Bind mount for /proc/kmsg failed: %m");
914                 return -errno;
915         }
916
917         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
918         if (fd < 0) {
919                 log_error("Failed to open fifo: %m");
920                 return -errno;
921         }
922
923         cmsg = CMSG_FIRSTHDR(&mh);
924         cmsg->cmsg_level = SOL_SOCKET;
925         cmsg->cmsg_type = SCM_RIGHTS;
926         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
927         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
928
929         mh.msg_controllen = cmsg->cmsg_len;
930
931         /* Store away the fd in the socket, so that it stays open as
932          * long as we run the child */
933         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
934         close_nointr_nofail(fd);
935
936         if (k < 0) {
937                 log_error("Failed to send FIFO fd: %m");
938                 return -errno;
939         }
940
941         /* And now make the FIFO unavailable as /dev/kmsg... */
942         unlink(from);
943         return 0;
944 }
945
946 static int setup_hostname(void) {
947
948         if (arg_share_system)
949                 return 0;
950
951         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
952                 return -errno;
953
954         return 0;
955 }
956
957 static int setup_journal(const char *directory) {
958         sd_id128_t machine_id, this_id;
959         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
960         char *id;
961         int r;
962
963         p = strappend(directory, "/etc/machine-id");
964         if (!p)
965                 return log_oom();
966
967         r = read_one_line_file(p, &b);
968         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
969                 return 0;
970         else if (r < 0) {
971                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
972                 return r;
973         }
974
975         id = strstrip(b);
976         if (isempty(id) && arg_link_journal == LINK_AUTO)
977                 return 0;
978
979         /* Verify validity */
980         r = sd_id128_from_string(id, &machine_id);
981         if (r < 0) {
982                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
983                 return r;
984         }
985
986         r = sd_id128_get_machine(&this_id);
987         if (r < 0) {
988                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
989                 return r;
990         }
991
992         if (sd_id128_equal(machine_id, this_id)) {
993                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
994                          "Host and machine ids are equal (%s): refusing to link journals", id);
995                 if (arg_link_journal == LINK_AUTO)
996                         return 0;
997                 return
998                         -EEXIST;
999         }
1000
1001         if (arg_link_journal == LINK_NO)
1002                 return 0;
1003
1004         free(p);
1005         p = strappend("/var/log/journal/", id);
1006         q = strjoin(directory, "/var/log/journal/", id, NULL);
1007         if (!p || !q)
1008                 return log_oom();
1009
1010         if (path_is_mount_point(p, false) > 0) {
1011                 if (arg_link_journal != LINK_AUTO) {
1012                         log_error("%s: already a mount point, refusing to use for journal", p);
1013                         return -EEXIST;
1014                 }
1015
1016                 return 0;
1017         }
1018
1019         if (path_is_mount_point(q, false) > 0) {
1020                 if (arg_link_journal != LINK_AUTO) {
1021                         log_error("%s: already a mount point, refusing to use for journal", q);
1022                         return -EEXIST;
1023                 }
1024
1025                 return 0;
1026         }
1027
1028         r = readlink_and_make_absolute(p, &d);
1029         if (r >= 0) {
1030                 if ((arg_link_journal == LINK_GUEST ||
1031                      arg_link_journal == LINK_AUTO) &&
1032                     path_equal(d, q)) {
1033
1034                         r = mkdir_p(q, 0755);
1035                         if (r < 0)
1036                                 log_warning("failed to create directory %s: %m", q);
1037                         return 0;
1038                 }
1039
1040                 if (unlink(p) < 0) {
1041                         log_error("Failed to remove symlink %s: %m", p);
1042                         return -errno;
1043                 }
1044         } else if (r == -EINVAL) {
1045
1046                 if (arg_link_journal == LINK_GUEST &&
1047                     rmdir(p) < 0) {
1048
1049                         if (errno == ENOTDIR) {
1050                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1051                                 return r;
1052                         } else {
1053                                 log_error("Failed to remove %s: %m", p);
1054                                 return -errno;
1055                         }
1056                 }
1057         } else if (r != -ENOENT) {
1058                 log_error("readlink(%s) failed: %m", p);
1059                 return r;
1060         }
1061
1062         if (arg_link_journal == LINK_GUEST) {
1063
1064                 if (symlink(q, p) < 0) {
1065                         log_error("Failed to symlink %s to %s: %m", q, p);
1066                         return -errno;
1067                 }
1068
1069                 r = mkdir_p(q, 0755);
1070                 if (r < 0)
1071                         log_warning("failed to create directory %s: %m", q);
1072                 return 0;
1073         }
1074
1075         if (arg_link_journal == LINK_HOST) {
1076                 r = mkdir_p(p, 0755);
1077                 if (r < 0) {
1078                         log_error("Failed to create %s: %m", p);
1079                         return r;
1080                 }
1081
1082         } else if (access(p, F_OK) < 0)
1083                 return 0;
1084
1085         if (dir_is_empty(q) == 0) {
1086                 log_error("%s not empty.", q);
1087                 return -ENOTEMPTY;
1088         }
1089
1090         r = mkdir_p(q, 0755);
1091         if (r < 0) {
1092                 log_error("Failed to create %s: %m", q);
1093                 return r;
1094         }
1095
1096         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1097                 log_error("Failed to bind mount journal from host into guest: %m");
1098                 return -errno;
1099         }
1100
1101         return 0;
1102 }
1103
1104 static int setup_kdbus(const char *dest, const char *path) {
1105         const char *p;
1106
1107         if (!path)
1108                 return 0;
1109
1110         p = strappenda(dest, "/dev/kdbus");
1111         if (mkdir(p, 0755) < 0) {
1112                 log_error("Failed to create kdbus path: %m");
1113                 return  -errno;
1114         }
1115
1116         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1117                 log_error("Failed to mount kdbus domain path: %m");
1118                 return -errno;
1119         }
1120
1121         return 0;
1122 }
1123
1124 static int drop_capabilities(void) {
1125         return capability_bounding_set_drop(~arg_retain, false);
1126 }
1127
1128 static int register_machine(pid_t pid) {
1129         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1130         _cleanup_bus_unref_ sd_bus *bus = NULL;
1131         int r;
1132
1133         if (!arg_register)
1134                 return 0;
1135
1136         r = sd_bus_default_system(&bus);
1137         if (r < 0) {
1138                 log_error("Failed to open system bus: %s", strerror(-r));
1139                 return r;
1140         }
1141
1142         if (arg_keep_unit) {
1143                 r = sd_bus_call_method(
1144                                 bus,
1145                                 "org.freedesktop.machine1",
1146                                 "/org/freedesktop/machine1",
1147                                 "org.freedesktop.machine1.Manager",
1148                                 "RegisterMachine",
1149                                 &error,
1150                                 NULL,
1151                                 "sayssus",
1152                                 arg_machine,
1153                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1154                                 "nspawn",
1155                                 "container",
1156                                 (uint32_t) pid,
1157                                 strempty(arg_directory));
1158         } else {
1159                 r = sd_bus_call_method(
1160                                 bus,
1161                                 "org.freedesktop.machine1",
1162                                 "/org/freedesktop/machine1",
1163                                 "org.freedesktop.machine1.Manager",
1164                                 "CreateMachine",
1165                                 &error,
1166                                 NULL,
1167                                 "sayssusa(sv)",
1168                                 arg_machine,
1169                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1170                                 "nspawn",
1171                                 "container",
1172                                 (uint32_t) pid,
1173                                 strempty(arg_directory),
1174                                 !isempty(arg_slice), "Slice", "s", arg_slice);
1175         }
1176
1177         if (r < 0) {
1178                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1179                 return r;
1180         }
1181
1182         return 0;
1183 }
1184
1185 static int terminate_machine(pid_t pid) {
1186         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1187         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1188         _cleanup_bus_unref_ sd_bus *bus = NULL;
1189         const char *path;
1190         int r;
1191
1192         if (!arg_register)
1193                 return 0;
1194
1195         r = sd_bus_default_system(&bus);
1196         if (r < 0) {
1197                 log_error("Failed to open system bus: %s", strerror(-r));
1198                 return r;
1199         }
1200
1201         r = sd_bus_call_method(
1202                         bus,
1203                         "org.freedesktop.machine1",
1204                         "/org/freedesktop/machine1",
1205                         "org.freedesktop.machine1.Manager",
1206                         "GetMachineByPID",
1207                         &error,
1208                         &reply,
1209                         "u",
1210                         (uint32_t) pid);
1211         if (r < 0) {
1212                 /* Note that the machine might already have been
1213                  * cleaned up automatically, hence don't consider it a
1214                  * failure if we cannot get the machine object. */
1215                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1216                 return 0;
1217         }
1218
1219         r = sd_bus_message_read(reply, "o", &path);
1220         if (r < 0)
1221                 return bus_log_parse_error(r);
1222
1223         r = sd_bus_call_method(
1224                         bus,
1225                         "org.freedesktop.machine1",
1226                         path,
1227                         "org.freedesktop.machine1.Machine",
1228                         "Terminate",
1229                         &error,
1230                         NULL,
1231                         NULL);
1232         if (r < 0) {
1233                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1234                 return 0;
1235         }
1236
1237         return 0;
1238 }
1239
1240 static int reset_audit_loginuid(void) {
1241         _cleanup_free_ char *p = NULL;
1242         int r;
1243
1244         if (arg_share_system)
1245                 return 0;
1246
1247         r = read_one_line_file("/proc/self/loginuid", &p);
1248         if (r == -EEXIST)
1249                 return 0;
1250         if (r < 0) {
1251                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1252                 return r;
1253         }
1254
1255         /* Already reset? */
1256         if (streq(p, "4294967295"))
1257                 return 0;
1258
1259         r = write_string_file("/proc/self/loginuid", "4294967295");
1260         if (r < 0) {
1261                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1262                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1263                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1264                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1265                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1266
1267                 sleep(5);
1268         }
1269
1270         return 0;
1271 }
1272
1273 static int setup_veth(int netns_fd) {
1274         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1275         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1276         char iface_name[IFNAMSIZ] = "ve-";
1277         int r;
1278
1279         if (!arg_private_network)
1280                 return 0;
1281
1282         if (!arg_network_veth)
1283                 return 0;
1284
1285         strncpy(iface_name+3, arg_machine, sizeof(iface_name) - 3);
1286
1287         r = sd_rtnl_open(0, &rtnl);
1288         if (r < 0) {
1289                 log_error("Failed to connect to netlink: %s", strerror(-r));
1290                 return r;
1291         }
1292
1293         r = sd_rtnl_message_new_link(RTM_NEWLINK, 0, &m);
1294         if (r < 0) {
1295                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1296                 return r;
1297         }
1298
1299         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1300         if (r < 0) {
1301                 log_error("Failed to append netlink kind: %s", strerror(-r));
1302                 return r;
1303         }
1304
1305         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1306         if (r < 0) {
1307                 log_error("Failed to open netlink container: %s", strerror(-r));
1308                 return r;
1309         }
1310
1311         r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1312         if (r < 0) {
1313                 log_error("Failed to append netlink kind: %s", strerror(-r));
1314                 return r;
1315         }
1316
1317         r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1318         if (r < 0) {
1319                 log_error("Failed to open netlink container: %s", strerror(-r));
1320                 return r;
1321         }
1322
1323         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1324         if (r < 0) {
1325                 log_error("z Failed to open netlink container: %s", strerror(-r));
1326                 return r;
1327         }
1328
1329         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1330         if (r < 0) {
1331                 log_error("Failed to append netlink kind: %s", strerror(-r));
1332                 return r;
1333         }
1334
1335         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_FD, netns_fd);
1336         if (r < 0) {
1337                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1338                 return r;
1339         }
1340
1341         r = sd_rtnl_message_close_container(m);
1342         if (r < 0) {
1343                 log_error("Failed to close netlink container: %s", strerror(-r));
1344                 return r;
1345         }
1346
1347         r = sd_rtnl_message_close_container(m);
1348         if (r < 0) {
1349                 log_error("Failed to close netlink container: %s", strerror(-r));
1350                 return r;
1351         }
1352
1353         r = sd_rtnl_message_close_container(m);
1354         if (r < 0) {
1355                 log_error("Failed to close netlink container: %s", strerror(-r));
1356                 return r;
1357         }
1358
1359         r = sd_rtnl_call(rtnl, m, 0, NULL);
1360         if (r < 0) {
1361                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1362                 return r;
1363         }
1364
1365         return 0;
1366 }
1367
1368 static int move_network_interfaces(pid_t pid) {
1369         _cleanup_udev_unref_ struct udev *udev = NULL;
1370         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1371         char **i;
1372         int r;
1373
1374         if (!arg_private_network)
1375                 return 0;
1376
1377         if (strv_isempty(arg_network_interfaces))
1378                 return 0;
1379
1380         r = sd_rtnl_open(0, &rtnl);
1381         if (r < 0) {
1382                 log_error("Failed to connect to netlink: %s", strerror(-r));
1383                 return r;
1384         }
1385
1386         udev = udev_new();
1387         if (!udev) {
1388                 log_error("Failed to connect to udev.");
1389                 return -ENOMEM;
1390         }
1391
1392         STRV_FOREACH(i, arg_network_interfaces) {
1393                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1394                 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1395                 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1396                 int ifi;
1397
1398                 ifi = (int) if_nametoindex(*i);
1399                 if (ifi <= 0) {
1400                         log_error("Failed to resolve interface %s: %m", *i);
1401                         return -errno;
1402                 }
1403
1404                 sprintf(ifi_str, "n%i", ifi);
1405                 d = udev_device_new_from_device_id(udev, ifi_str);
1406                 if (!d) {
1407                         log_error("Failed to get udev device for interface %s: %m", *i);
1408                         return -errno;
1409                 }
1410
1411                 if (udev_device_get_is_initialized(d) <= 0) {
1412                         log_error("Network interface %s is not initialized yet.", *i);
1413                         return -EBUSY;
1414                 }
1415
1416                 r = sd_rtnl_message_new_link(RTM_NEWLINK, ifi, &m);
1417                 if (r < 0) {
1418                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1419                         return r;
1420                 }
1421
1422                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1423                 if (r < 0) {
1424                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1425                         return r;
1426                 }
1427
1428                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1429                 if (r < 0) {
1430                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1431                         return r;
1432                 }
1433         }
1434
1435         return 0;
1436 }
1437
/* Installs a seccomp filter that makes socket(AF_NETLINK, *,
 * NETLINK_AUDIT) fail with EAFNOSUPPORT while allowing everything
 * else. Without HAVE_SECCOMP this is a no-op.
 *
 * Returns 0 on success (or when built without seccomp), the value of
 * log_oom() if the filter context cannot be allocated, or the negative
 * value returned by the failing libseccomp call. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx filter;
        int rc;

        /*
           Audit does not work in containers: much of the userspace
           audit hookup fails when run inside one. We do not care and
           simply turn off the creation of audit sockets.

           As a result socket(AF_NETLINK, *, NETLINK_AUDIT) fails with
           EAFNOSUPPORT, which audit userspace takes as indication that
           audit is disabled in the kernel.
         */

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        rc = seccomp_rule_add_exact(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (rc < 0)
                log_error("Failed to add audit seccomp rule: %s", strerror(-rc));
        else {
                rc = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
                if (rc < 0)
                        log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-rc));
                else {
                        rc = seccomp_load(filter);
                        if (rc < 0)
                                log_error("Failed to install seccomp audit filter: %s", strerror(-rc));
                }
        }

        /* The context is released on every path, success or not */
        seccomp_release(filter);
        return rc;
#else
        return 0;
#endif

}
1488
1489 int main(int argc, char *argv[]) {
1490
1491         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1, netns_fd = -1;
1492         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1493         _cleanup_free_ char *kdbus_domain = NULL;
1494         _cleanup_fdset_free_ FDSet *fds = NULL;
1495         const char *console = NULL;
1496         int r = EXIT_FAILURE, k;
1497         int n_fd_passed;
1498         pid_t pid = 0;
1499         sigset_t mask;
1500
1501         log_parse_environment();
1502         log_open();
1503
1504         k = parse_argv(argc, argv);
1505         if (k < 0)
1506                 goto finish;
1507         else if (k == 0) {
1508                 r = EXIT_SUCCESS;
1509                 goto finish;
1510         }
1511
1512         if (arg_directory) {
1513                 char *p;
1514
1515                 p = path_make_absolute_cwd(arg_directory);
1516                 free(arg_directory);
1517                 arg_directory = p;
1518         } else
1519                 arg_directory = get_current_dir_name();
1520
1521         if (!arg_directory) {
1522                 log_error("Failed to determine path, please use -D.");
1523                 goto finish;
1524         }
1525
1526         path_kill_slashes(arg_directory);
1527
1528         if (!arg_machine) {
1529                 arg_machine = strdup(basename(arg_directory));
1530                 if (!arg_machine) {
1531                         log_oom();
1532                         goto finish;
1533                 }
1534
1535                 hostname_cleanup(arg_machine, false);
1536                 if (isempty(arg_machine)) {
1537                         log_error("Failed to determine machine name automatically, please use -M.");
1538                         goto finish;
1539                 }
1540         }
1541
1542         if (geteuid() != 0) {
1543                 log_error("Need to be root.");
1544                 goto finish;
1545         }
1546
1547         if (sd_booted() <= 0) {
1548                 log_error("Not running on a systemd system.");
1549                 goto finish;
1550         }
1551
1552         if (path_equal(arg_directory, "/")) {
1553                 log_error("Spawning container on root directory not supported.");
1554                 goto finish;
1555         }
1556
1557         if (arg_boot) {
1558                 if (path_is_os_tree(arg_directory) <= 0) {
1559                         log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1560                         goto finish;
1561                 }
1562         } else {
1563                 const char *p;
1564
1565                 p = strappenda(arg_directory,
1566                                argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
1567                 if (access(p, F_OK) < 0) {
1568                         log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
1569                         goto finish;
1570
1571                 }
1572         }
1573
1574         log_close();
1575         n_fd_passed = sd_listen_fds(false);
1576         if (n_fd_passed > 0) {
1577                 k = fdset_new_listen_fds(&fds, false);
1578                 if (k < 0) {
1579                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1580                         goto finish;
1581                 }
1582         }
1583         fdset_close_others(fds);
1584         log_open();
1585
1586         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1587         if (master < 0) {
1588                 log_error("Failed to acquire pseudo tty: %m");
1589                 goto finish;
1590         }
1591
1592         console = ptsname(master);
1593         if (!console) {
1594                 log_error("Failed to determine tty name: %m");
1595                 goto finish;
1596         }
1597
1598         if (!arg_quiet)
1599                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1600
1601         if (unlockpt(master) < 0) {
1602                 log_error("Failed to unlock tty: %m");
1603                 goto finish;
1604         }
1605
1606         if (arg_network_veth) {
1607                 netns_fd = open("/proc/self/ns/net", O_RDWR|O_CLOEXEC);
1608                 if (netns_fd < 0) {
1609                         log_error("Failed to open network namespace fd: %m");
1610                         goto finish;
1611                 }
1612         }
1613
1614         if (access("/dev/kdbus/control", F_OK) >= 0) {
1615
1616                 if (arg_share_system) {
1617                         kdbus_domain = strdup("/dev/kdbus");
1618                         if (!kdbus_domain) {
1619                                 log_oom();
1620                                 goto finish;
1621                         }
1622                 } else {
1623                         const char *ns;
1624
1625                         ns = strappenda("machine-", arg_machine);
1626                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1627                         if (r < 0)
1628                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1629                         else
1630                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1631                 }
1632         }
1633
1634         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1635                 log_error("Failed to create kmsg socket pair: %m");
1636                 goto finish;
1637         }
1638
1639         sd_notify(0, "READY=1");
1640
1641         assert_se(sigemptyset(&mask) == 0);
1642         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1643         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1644
1645         for (;;) {
1646                 siginfo_t status;
1647
1648                 sync_fd = eventfd(0, EFD_CLOEXEC);
1649                 if (sync_fd < 0) {
1650                         log_error("Failed to create event fd: %m");
1651                         goto finish;
1652                 }
1653
1654                 pid = syscall(__NR_clone,
1655                               SIGCHLD|CLONE_NEWNS|
1656                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1657                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
1658                 if (pid < 0) {
1659                         if (errno == EINVAL)
1660                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1661                         else
1662                                 log_error("clone() failed: %m");
1663
1664                         goto finish;
1665                 }
1666
1667                 if (pid == 0) {
1668                         /* child */
1669                         const char *home = NULL;
1670                         uid_t uid = (uid_t) -1;
1671                         gid_t gid = (gid_t) -1;
1672                         unsigned n_env = 2;
1673                         const char *envp[] = {
1674                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1675                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1676                                 NULL, /* TERM */
1677                                 NULL, /* HOME */
1678                                 NULL, /* USER */
1679                                 NULL, /* LOGNAME */
1680                                 NULL, /* container_uuid */
1681                                 NULL, /* LISTEN_FDS */
1682                                 NULL, /* LISTEN_PID */
1683                                 NULL
1684                         };
1685                         char **env_use;
1686                         eventfd_t x;
1687
1688                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1689                         if (envp[n_env])
1690                                 n_env ++;
1691
1692                         close_nointr_nofail(master);
1693                         master = -1;
1694
1695                         close_nointr(STDIN_FILENO);
1696                         close_nointr(STDOUT_FILENO);
1697                         close_nointr(STDERR_FILENO);
1698
1699                         close_nointr_nofail(kmsg_socket_pair[0]);
1700                         kmsg_socket_pair[0] = -1;
1701
1702                         reset_all_signal_handlers();
1703
1704                         assert_se(sigemptyset(&mask) == 0);
1705                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1706
1707                         k = open_terminal(console, O_RDWR);
1708                         if (k != STDIN_FILENO) {
1709                                 if (k >= 0) {
1710                                         close_nointr_nofail(k);
1711                                         k = -EINVAL;
1712                                 }
1713
1714                                 log_error("Failed to open console: %s", strerror(-k));
1715                                 goto child_fail;
1716                         }
1717
1718                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1719                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1720                                 log_error("Failed to duplicate console: %m");
1721                                 goto child_fail;
1722                         }
1723
1724                         if (setsid() < 0) {
1725                                 log_error("setsid() failed: %m");
1726                                 goto child_fail;
1727                         }
1728
1729                         if (reset_audit_loginuid() < 0)
1730                                 goto child_fail;
1731
1732                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1733                                 log_error("PR_SET_PDEATHSIG failed: %m");
1734                                 goto child_fail;
1735                         }
1736
1737                         /* Mark everything as slave, so that we still
1738                          * receive mounts from the real root, but don't
1739                          * propagate mounts to the real root. */
1740                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1741                                 log_error("MS_SLAVE|MS_REC failed: %m");
1742                                 goto child_fail;
1743                         }
1744
1745                         /* Turn directory into bind mount */
1746                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1747                                 log_error("Failed to make bind mount.");
1748                                 goto child_fail;
1749                         }
1750
1751                         if (arg_read_only)
1752                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1753                                         log_error("Failed to make read-only.");
1754                                         goto child_fail;
1755                                 }
1756
1757                         if (mount_all(arg_directory) < 0)
1758                                 goto child_fail;
1759
1760                         if (copy_devnodes(arg_directory) < 0)
1761                                 goto child_fail;
1762
1763                         if (setup_ptmx(arg_directory) < 0)
1764                                 goto child_fail;
1765
1766                         dev_setup(arg_directory);
1767
1768                         if (setup_veth(netns_fd) < 0)
1769                                 goto child_fail;
1770
1771                         if (netns_fd >= 0) {
1772                                 close_nointr_nofail(netns_fd);
1773                                 netns_fd = -1;
1774                         }
1775
1776                         if (audit_still_doesnt_work_in_containers() < 0)
1777                                 goto child_fail;
1778
1779                         if (setup_dev_console(arg_directory, console) < 0)
1780                                 goto child_fail;
1781
1782                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1783                                 goto child_fail;
1784
1785                         close_nointr_nofail(kmsg_socket_pair[1]);
1786                         kmsg_socket_pair[1] = -1;
1787
1788                         if (setup_boot_id(arg_directory) < 0)
1789                                 goto child_fail;
1790
1791                         if (setup_timezone(arg_directory) < 0)
1792                                 goto child_fail;
1793
1794                         if (setup_resolv_conf(arg_directory) < 0)
1795                                 goto child_fail;
1796
1797                         if (setup_journal(arg_directory) < 0)
1798                                 goto child_fail;
1799
1800                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1801                                 goto child_fail;
1802
1803                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1804                                 goto child_fail;
1805
1806                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1807                                 goto child_fail;
1808
1809                         if (chdir(arg_directory) < 0) {
1810                                 log_error("chdir(%s) failed: %m", arg_directory);
1811                                 goto child_fail;
1812                         }
1813
1814                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1815                                 log_error("mount(MS_MOVE) failed: %m");
1816                                 goto child_fail;
1817                         }
1818
1819                         if (chroot(".") < 0) {
1820                                 log_error("chroot() failed: %m");
1821                                 goto child_fail;
1822                         }
1823
1824                         if (chdir("/") < 0) {
1825                                 log_error("chdir() failed: %m");
1826                                 goto child_fail;
1827                         }
1828
1829                         umask(0022);
1830
1831                         if (arg_private_network)
1832                                 loopback_setup();
1833
1834                         if (drop_capabilities() < 0) {
1835                                 log_error("drop_capabilities() failed: %m");
1836                                 goto child_fail;
1837                         }
1838
1839                         if (arg_user) {
1840
1841                                 /* Note that this resolves user names
1842                                  * inside the container, and hence
1843                                  * accesses the NSS modules from the
1844                                  * container and not the host. This is
1845                                  * a bit weird... */
1846
1847                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1848                                         log_error("get_user_creds() failed: %m");
1849                                         goto child_fail;
1850                                 }
1851
1852                                 if (mkdir_parents_label(home, 0775) < 0) {
1853                                         log_error("mkdir_parents_label() failed: %m");
1854                                         goto child_fail;
1855                                 }
1856
1857                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1858                                         log_error("mkdir_safe_label() failed: %m");
1859                                         goto child_fail;
1860                                 }
1861
1862                                 if (initgroups((const char*)arg_user, gid) < 0) {
1863                                         log_error("initgroups() failed: %m");
1864                                         goto child_fail;
1865                                 }
1866
1867                                 if (setresgid(gid, gid, gid) < 0) {
1868                                         log_error("setregid() failed: %m");
1869                                         goto child_fail;
1870                                 }
1871
1872                                 if (setresuid(uid, uid, uid) < 0) {
1873                                         log_error("setreuid() failed: %m");
1874                                         goto child_fail;
1875                                 }
1876                         } else {
1877                                 /* Reset everything fully to 0, just in case */
1878
1879                                 if (setgroups(0, NULL) < 0) {
1880                                         log_error("setgroups() failed: %m");
1881                                         goto child_fail;
1882                                 }
1883
1884                                 if (setresgid(0, 0, 0) < 0) {
1885                                         log_error("setregid() failed: %m");
1886                                         goto child_fail;
1887                                 }
1888
1889                                 if (setresuid(0, 0, 0) < 0) {
1890                                         log_error("setreuid() failed: %m");
1891                                         goto child_fail;
1892                                 }
1893                         }
1894
1895                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1896                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1897                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1898                                 log_oom();
1899                                 goto child_fail;
1900                         }
1901
1902                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1903                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1904                                         log_oom();
1905                                         goto child_fail;
1906                                 }
1907                         }
1908
1909                         if (fdset_size(fds) > 0) {
1910                                 k = fdset_cloexec(fds, false);
1911                                 if (k < 0) {
1912                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1913                                         goto child_fail;
1914                                 }
1915
1916                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1917                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1918                                         log_oom();
1919                                         goto child_fail;
1920                                 }
1921                         }
1922
1923                         setup_hostname();
1924
1925                         eventfd_read(sync_fd, &x);
1926                         close_nointr_nofail(sync_fd);
1927                         sync_fd = -1;
1928
1929                         if (!strv_isempty(arg_setenv)) {
1930                                 char **n;
1931
1932                                 n = strv_env_merge(2, envp, arg_setenv);
1933                                 if (!n) {
1934                                         log_oom();
1935                                         goto child_fail;
1936                                 }
1937
1938                                 env_use = n;
1939                         } else
1940                                 env_use = (char**) envp;
1941
1942 #ifdef HAVE_SELINUX
1943                         if (arg_selinux_context)
1944                                 if (setexeccon(arg_selinux_context) < 0)
1945                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1946 #endif
1947                         if (arg_boot) {
1948                                 char **a;
1949                                 size_t l;
1950
1951                                 /* Automatically search for the init system */
1952
1953                                 l = 1 + argc - optind;
1954                                 a = newa(char*, l + 1);
1955                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1956
1957                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1958                                 execve(a[0], a, env_use);
1959
1960                                 a[0] = (char*) "/lib/systemd/systemd";
1961                                 execve(a[0], a, env_use);
1962
1963                                 a[0] = (char*) "/sbin/init";
1964                                 execve(a[0], a, env_use);
1965                         } else if (argc > optind)
1966                                 execvpe(argv[optind], argv + optind, env_use);
1967                         else {
1968                                 chdir(home ? home : "/root");
1969                                 execle("/bin/bash", "-bash", NULL, env_use);
1970                                 execle("/bin/sh", "-sh", NULL, env_use);
1971                         }
1972
1973                         log_error("execv() failed: %m");
1974
1975                 child_fail:
1976                         _exit(EXIT_FAILURE);
1977                 }
1978
1979                 fdset_free(fds);
1980                 fds = NULL;
1981
1982                 r = register_machine(pid);
1983                 if (r < 0)
1984                         goto finish;
1985
1986                 r = move_network_interfaces(pid);
1987                 if (r < 0)
1988                         goto finish;
1989
1990                 eventfd_write(sync_fd, 1);
1991                 close_nointr_nofail(sync_fd);
1992                 sync_fd = -1;
1993
1994                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1995                 if (k < 0) {
1996                         r = EXIT_FAILURE;
1997                         break;
1998                 }
1999
2000                 if (!arg_quiet)
2001                         putc('\n', stdout);
2002
2003                 /* Kill if it is not dead yet anyway */
2004                 terminate_machine(pid);
2005
2006                 /* Redundant, but better safe than sorry */
2007                 kill(pid, SIGKILL);
2008
2009                 k = wait_for_terminate(pid, &status);
2010                 pid = 0;
2011
2012                 if (k < 0) {
2013                         r = EXIT_FAILURE;
2014                         break;
2015                 }
2016
2017                 if (status.si_code == CLD_EXITED) {
2018                         r = status.si_status;
2019                         if (status.si_status != 0) {
2020                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
2021                                 break;
2022                         }
2023
2024                         if (!arg_quiet)
2025                                 log_debug("Container %s exited successfully.", arg_machine);
2026                         break;
2027                 } else if (status.si_code == CLD_KILLED &&
2028                            status.si_status == SIGINT) {
2029
2030                         if (!arg_quiet)
2031                                 log_info("Container %s has been shut down.", arg_machine);
2032                         r = 0;
2033                         break;
2034                 } else if (status.si_code == CLD_KILLED &&
2035                            status.si_status == SIGHUP) {
2036
2037                         if (!arg_quiet)
2038                                 log_info("Container %s is being rebooted.", arg_machine);
2039                         continue;
2040                 } else if (status.si_code == CLD_KILLED ||
2041                            status.si_code == CLD_DUMPED) {
2042
2043                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2044                         r = EXIT_FAILURE;
2045                         break;
2046                 } else {
2047                         log_error("Container %s failed due to unknown reason.", arg_machine);
2048                         r = EXIT_FAILURE;
2049                         break;
2050                 }
2051         }
2052
2053 finish:
2054         if (pid > 0)
2055                 kill(pid, SIGKILL);
2056
2057         free(arg_directory);
2058         free(arg_machine);
2059         free(arg_setenv);
2060         free(arg_network_interfaces);
2061
2062         return r;
2063 }