chiark / gitweb /
fcc0f17aee011a9542262bd0d18177d6dcdadf62
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <linux/rtnetlink.h>
44 #include <sys/eventfd.h>
45 #include <net/if.h>
46 #include <linux/veth.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #include "sd-daemon.h"
57 #include "sd-bus.h"
58 #include "sd-id128.h"
59 #include "sd-rtnl.h"
60 #include "log.h"
61 #include "util.h"
62 #include "mkdir.h"
63 #include "macro.h"
64 #include "audit.h"
65 #include "missing.h"
66 #include "cgroup-util.h"
67 #include "strv.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
71 #include "fdset.h"
72 #include "build.h"
73 #include "fileio.h"
74 #include "bus-util.h"
75 #include "bus-error.h"
76 #include "ptyfwd.h"
77 #include "bus-kernel.h"
78 #include "env-util.h"
79 #include "def.h"
80 #include "rtnl-util.h"
81 #include "udev-util.h"
82
/* Journal linking mode chosen with --link-journal= (or -j). The enumerators
 * correspond to the option keywords "no", "auto", "host" and "guest". */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3,
} LinkJournal;
89
90 static char *arg_directory = NULL;
91 static char *arg_user = NULL;
92 static sd_id128_t arg_uuid = {};
93 static char *arg_machine = NULL;
94 static char *arg_selinux_context = NULL;
95 static char *arg_selinux_apifs_context = NULL;
96 static const char *arg_slice = NULL;
97 static bool arg_private_network = false;
98 static bool arg_read_only = false;
99 static bool arg_boot = false;
100 static LinkJournal arg_link_journal = LINK_AUTO;
101 static uint64_t arg_retain =
102         (1ULL << CAP_CHOWN) |
103         (1ULL << CAP_DAC_OVERRIDE) |
104         (1ULL << CAP_DAC_READ_SEARCH) |
105         (1ULL << CAP_FOWNER) |
106         (1ULL << CAP_FSETID) |
107         (1ULL << CAP_IPC_OWNER) |
108         (1ULL << CAP_KILL) |
109         (1ULL << CAP_LEASE) |
110         (1ULL << CAP_LINUX_IMMUTABLE) |
111         (1ULL << CAP_NET_BIND_SERVICE) |
112         (1ULL << CAP_NET_BROADCAST) |
113         (1ULL << CAP_NET_RAW) |
114         (1ULL << CAP_SETGID) |
115         (1ULL << CAP_SETFCAP) |
116         (1ULL << CAP_SETPCAP) |
117         (1ULL << CAP_SETUID) |
118         (1ULL << CAP_SYS_ADMIN) |
119         (1ULL << CAP_SYS_CHROOT) |
120         (1ULL << CAP_SYS_NICE) |
121         (1ULL << CAP_SYS_PTRACE) |
122         (1ULL << CAP_SYS_TTY_CONFIG) |
123         (1ULL << CAP_SYS_RESOURCE) |
124         (1ULL << CAP_SYS_BOOT) |
125         (1ULL << CAP_AUDIT_WRITE) |
126         (1ULL << CAP_AUDIT_CONTROL) |
127         (1ULL << CAP_MKNOD);
128 static char **arg_bind = NULL;
129 static char **arg_bind_ro = NULL;
130 static char **arg_setenv = NULL;
131 static bool arg_quiet = false;
132 static bool arg_share_system = false;
133 static bool arg_register = true;
134 static bool arg_keep_unit = false;
135 static char **arg_network_interfaces = NULL;
136 static bool arg_network_veth = false;
137
138 static int help(void) {
139
140         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
141                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
142                "  -h --help                 Show this help\n"
143                "     --version              Print version string\n"
144                "  -q --quiet                Do not show status information\n"
145                "  -D --directory=NAME       Root directory for the container\n"
146                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
147                "  -u --user=USER            Run the command under specified user or uid\n"
148                "  -M --machine=NAME         Set the machine name for the container\n"
149                "     --uuid=UUID            Set a specific machine UUID for the container\n"
150                "  -S --slice=SLICE          Place the container in the specified slice\n"
151                "     --private-network      Disable network in container\n"
152                "     --network-interface=INTERFACE\n"
153                "                            Assign an existing network interface to the\n"
154                "                            container\n"
155                "     --network-veth         Add a a virtual ethernet connection between host\n"
156                "                            and container\n"
157                "  -Z --selinux-context=SECLABEL\n"
158                "                            Set the SELinux security context to be used by\n"
159                "                            processes in the container\n"
160                "  -L --selinux-apifs-context=SECLABEL\n"
161                "                            Set the SELinux security context to be used by\n"
162                "                            API/tmpfs file systems in the container\n"
163                "     --capability=CAP       In addition to the default, retain specified\n"
164                "                            capability\n"
165                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
166                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
167                "  -j                        Equivalent to --link-journal=host\n"
168                "     --read-only            Mount the root directory read-only\n"
169                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
170                "                            the container\n"
171                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
172                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
173                "     --share-system         Share system namespaces with host\n"
174                "     --register=BOOLEAN     Register container as machine\n"
175                "     --keep-unit            Do not register a scope for the machine, reuse\n"
176                "                            the service unit nspawn is running in\n",
177                program_invocation_short_name);
178
179         return 0;
180 }
181
182 static int parse_argv(int argc, char *argv[]) {
183
184         enum {
185                 ARG_VERSION = 0x100,
186                 ARG_PRIVATE_NETWORK,
187                 ARG_UUID,
188                 ARG_READ_ONLY,
189                 ARG_CAPABILITY,
190                 ARG_DROP_CAPABILITY,
191                 ARG_LINK_JOURNAL,
192                 ARG_BIND,
193                 ARG_BIND_RO,
194                 ARG_SETENV,
195                 ARG_SHARE_SYSTEM,
196                 ARG_REGISTER,
197                 ARG_KEEP_UNIT,
198                 ARG_NETWORK_INTERFACE,
199                 ARG_NETWORK_VETH,
200         };
201
202         static const struct option options[] = {
203                 { "help",                  no_argument,       NULL, 'h'                   },
204                 { "version",               no_argument,       NULL, ARG_VERSION           },
205                 { "directory",             required_argument, NULL, 'D'                   },
206                 { "user",                  required_argument, NULL, 'u'                   },
207                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
208                 { "boot",                  no_argument,       NULL, 'b'                   },
209                 { "uuid",                  required_argument, NULL, ARG_UUID              },
210                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
211                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
212                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
213                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
214                 { "bind",                  required_argument, NULL, ARG_BIND              },
215                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
216                 { "machine",               required_argument, NULL, 'M'                   },
217                 { "slice",                 required_argument, NULL, 'S'                   },
218                 { "setenv",                required_argument, NULL, ARG_SETENV            },
219                 { "selinux-context",       required_argument, NULL, 'Z'                   },
220                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
221                 { "quiet",                 no_argument,       NULL, 'q'                   },
222                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
223                 { "register",              required_argument, NULL, ARG_REGISTER          },
224                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
225                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
226                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH   },
227                 {}
228         };
229
230         int c, r;
231         uint64_t plus = 0, minus = 0;
232
233         assert(argc >= 0);
234         assert(argv);
235
236         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
237
238                 switch (c) {
239
240                 case 'h':
241                         return help();
242
243                 case ARG_VERSION:
244                         puts(PACKAGE_STRING);
245                         puts(SYSTEMD_FEATURES);
246                         return 0;
247
248                 case 'D':
249                         free(arg_directory);
250                         arg_directory = canonicalize_file_name(optarg);
251                         if (!arg_directory) {
252                                 log_error("Invalid root directory: %m");
253                                 return -ENOMEM;
254                         }
255
256                         break;
257
258                 case 'u':
259                         free(arg_user);
260                         arg_user = strdup(optarg);
261                         if (!arg_user)
262                                 return log_oom();
263
264                         break;
265
266                 case ARG_NETWORK_VETH:
267                         arg_network_veth = true;
268                         arg_private_network = true;
269                         break;
270
271                 case ARG_NETWORK_INTERFACE:
272                         if (strv_push(&arg_network_interfaces, optarg) < 0)
273                                 return log_oom();
274
275                         /* fall through */
276
277                 case ARG_PRIVATE_NETWORK:
278                         arg_private_network = true;
279                         break;
280
281                 case 'b':
282                         arg_boot = true;
283                         break;
284
285                 case ARG_UUID:
286                         r = sd_id128_from_string(optarg, &arg_uuid);
287                         if (r < 0) {
288                                 log_error("Invalid UUID: %s", optarg);
289                                 return r;
290                         }
291                         break;
292
293                 case 'S':
294                         arg_slice = strdup(optarg);
295                         if (!arg_slice)
296                                 return log_oom();
297
298                         break;
299
300                 case 'M':
301                         if (isempty(optarg)) {
302                                 free(arg_machine);
303                                 arg_machine = NULL;
304                         } else {
305
306                                 if (!hostname_is_valid(optarg)) {
307                                         log_error("Invalid machine name: %s", optarg);
308                                         return -EINVAL;
309                                 }
310
311                                 free(arg_machine);
312                                 arg_machine = strdup(optarg);
313                                 if (!arg_machine)
314                                         return log_oom();
315
316                                 break;
317                         }
318
319                 case 'Z':
320                         arg_selinux_context = optarg;
321                         break;
322
323                 case 'L':
324                         arg_selinux_apifs_context = optarg;
325                         break;
326
327                 case ARG_READ_ONLY:
328                         arg_read_only = true;
329                         break;
330
331                 case ARG_CAPABILITY:
332                 case ARG_DROP_CAPABILITY: {
333                         char *state, *word;
334                         size_t length;
335
336                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
337                                 _cleanup_free_ char *t;
338                                 cap_value_t cap;
339
340                                 t = strndup(word, length);
341                                 if (!t)
342                                         return log_oom();
343
344                                 if (streq(t, "all")) {
345                                         if (c == ARG_CAPABILITY)
346                                                 plus = (uint64_t) -1;
347                                         else
348                                                 minus = (uint64_t) -1;
349                                 } else {
350                                         if (cap_from_name(t, &cap) < 0) {
351                                                 log_error("Failed to parse capability %s.", t);
352                                                 return -EINVAL;
353                                         }
354
355                                         if (c == ARG_CAPABILITY)
356                                                 plus |= 1ULL << (uint64_t) cap;
357                                         else
358                                                 minus |= 1ULL << (uint64_t) cap;
359                                 }
360                         }
361
362                         break;
363                 }
364
365                 case 'j':
366                         arg_link_journal = LINK_GUEST;
367                         break;
368
369                 case ARG_LINK_JOURNAL:
370                         if (streq(optarg, "auto"))
371                                 arg_link_journal = LINK_AUTO;
372                         else if (streq(optarg, "no"))
373                                 arg_link_journal = LINK_NO;
374                         else if (streq(optarg, "guest"))
375                                 arg_link_journal = LINK_GUEST;
376                         else if (streq(optarg, "host"))
377                                 arg_link_journal = LINK_HOST;
378                         else {
379                                 log_error("Failed to parse link journal mode %s", optarg);
380                                 return -EINVAL;
381                         }
382
383                         break;
384
385                 case ARG_BIND:
386                 case ARG_BIND_RO: {
387                         _cleanup_free_ char *a = NULL, *b = NULL;
388                         char *e;
389                         char ***x;
390
391                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
392
393                         e = strchr(optarg, ':');
394                         if (e) {
395                                 a = strndup(optarg, e - optarg);
396                                 b = strdup(e + 1);
397                         } else {
398                                 a = strdup(optarg);
399                                 b = strdup(optarg);
400                         }
401
402                         if (!a || !b)
403                                 return log_oom();
404
405                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
406                                 log_error("Invalid bind mount specification: %s", optarg);
407                                 return -EINVAL;
408                         }
409
410                         r = strv_extend(x, a);
411                         if (r < 0)
412                                 return log_oom();
413
414                         r = strv_extend(x, b);
415                         if (r < 0)
416                                 return log_oom();
417
418                         break;
419                 }
420
421                 case ARG_SETENV: {
422                         char **n;
423
424                         if (!env_assignment_is_valid(optarg)) {
425                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
426                                 return -EINVAL;
427                         }
428
429                         n = strv_env_set(arg_setenv, optarg);
430                         if (!n)
431                                 return log_oom();
432
433                         strv_free(arg_setenv);
434                         arg_setenv = n;
435                         break;
436                 }
437
438                 case 'q':
439                         arg_quiet = true;
440                         break;
441
442                 case ARG_SHARE_SYSTEM:
443                         arg_share_system = true;
444                         break;
445
446                 case ARG_REGISTER:
447                         r = parse_boolean(optarg);
448                         if (r < 0) {
449                                 log_error("Failed to parse --register= argument: %s", optarg);
450                                 return r;
451                         }
452
453                         arg_register = r;
454                         break;
455
456                 case ARG_KEEP_UNIT:
457                         arg_keep_unit = true;
458                         break;
459
460                 case '?':
461                         return -EINVAL;
462
463                 default:
464                         assert_not_reached("Unhandled option");
465                 }
466         }
467
468         if (arg_share_system)
469                 arg_register = false;
470
471         if (arg_boot && arg_share_system) {
472                 log_error("--boot and --share-system may not be combined.");
473                 return -EINVAL;
474         }
475
476         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
477                 log_error("--keep-unit may not be used when invoked from a user session.");
478                 return -EINVAL;
479         }
480
481         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
482
483         return 1;
484 }
485
486 static int mount_all(const char *dest) {
487
488         typedef struct MountPoint {
489                 const char *what;
490                 const char *where;
491                 const char *type;
492                 const char *options;
493                 unsigned long flags;
494                 bool fatal;
495         } MountPoint;
496
497         static const MountPoint mount_table[] = {
498                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
499                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
500                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
501                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
502                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
503                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
504                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
505                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
506 #ifdef HAVE_SELINUX
507                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
508                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
509 #endif
510         };
511
512         unsigned k;
513         int r = 0;
514
515         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
516                 _cleanup_free_ char *where = NULL;
517 #ifdef HAVE_SELINUX
518                 _cleanup_free_ char *options = NULL;
519 #endif
520                 const char *o;
521                 int t;
522
523                 where = strjoin(dest, "/", mount_table[k].where, NULL);
524                 if (!where)
525                         return log_oom();
526
527                 t = path_is_mount_point(where, true);
528                 if (t < 0) {
529                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
530
531                         if (r == 0)
532                                 r = t;
533
534                         continue;
535                 }
536
537                 /* Skip this entry if it is not a remount. */
538                 if (mount_table[k].what && t > 0)
539                         continue;
540
541                 mkdir_p(where, 0755);
542
543 #ifdef HAVE_SELINUX
544                 if (arg_selinux_apifs_context &&
545                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
546                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
547                         if (!options)
548                                 return log_oom();
549
550                         o = options;
551                 } else
552 #endif
553                         o = mount_table[k].options;
554
555
556                 if (mount(mount_table[k].what,
557                           where,
558                           mount_table[k].type,
559                           mount_table[k].flags,
560                           o) < 0 &&
561                     mount_table[k].fatal) {
562
563                         log_error("mount(%s) failed: %m", where);
564
565                         if (r == 0)
566                                 r = -errno;
567                 }
568         }
569
570         return r;
571 }
572
573 static int mount_binds(const char *dest, char **l, unsigned long flags) {
574         char **x, **y;
575
576         STRV_FOREACH_PAIR(x, y, l) {
577                 char *where;
578                 struct stat source_st, dest_st;
579                 int r;
580
581                 if (stat(*x, &source_st) < 0) {
582                         log_error("failed to stat %s: %m", *x);
583                         return -errno;
584                 }
585
586                 where = strappenda(dest, *y);
587                 r = stat(where, &dest_st);
588                 if (r == 0) {
589                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
590                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
591                                                 *x, where);
592                                 return -EINVAL;
593                         }
594                 } else if (errno == ENOENT) {
595                         r = mkdir_parents_label(where, 0755);
596                         if (r < 0) {
597                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
598                                 return r;
599                         }
600                 } else {
601                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
602                         return -errno;
603                 }
604                 /* Create the mount point, but be conservative -- refuse to create block
605                 * and char devices. */
606                 if (S_ISDIR(source_st.st_mode))
607                         mkdir_label(where, 0755);
608                 else if (S_ISFIFO(source_st.st_mode))
609                         mkfifo(where, 0644);
610                 else if (S_ISSOCK(source_st.st_mode))
611                         mknod(where, 0644 | S_IFSOCK, 0);
612                 else if (S_ISREG(source_st.st_mode))
613                         touch(where);
614                 else {
615                         log_error("Refusing to create mountpoint for file: %s", *x);
616                         return -ENOTSUP;
617                 }
618
619                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
620                         log_error("mount(%s) failed: %m", where);
621                         return -errno;
622                 }
623
624                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
625                         log_error("mount(%s) failed: %m", where);
626                         return -errno;
627                 }
628         }
629
630         return 0;
631 }
632
633 static int setup_timezone(const char *dest) {
634         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
635         char *z, *y;
636         int r;
637
638         assert(dest);
639
640         /* Fix the timezone, if possible */
641         r = readlink_malloc("/etc/localtime", &p);
642         if (r < 0) {
643                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
644                 return 0;
645         }
646
647         z = path_startswith(p, "../usr/share/zoneinfo/");
648         if (!z)
649                 z = path_startswith(p, "/usr/share/zoneinfo/");
650         if (!z) {
651                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
652                 return 0;
653         }
654
655         where = strappend(dest, "/etc/localtime");
656         if (!where)
657                 return log_oom();
658
659         r = readlink_malloc(where, &q);
660         if (r >= 0) {
661                 y = path_startswith(q, "../usr/share/zoneinfo/");
662                 if (!y)
663                         y = path_startswith(q, "/usr/share/zoneinfo/");
664
665
666                 /* Already pointing to the right place? Then do nothing .. */
667                 if (y && streq(y, z))
668                         return 0;
669         }
670
671         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
672         if (!check)
673                 return log_oom();
674
675         if (access(check, F_OK) < 0) {
676                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
677                 return 0;
678         }
679
680         what = strappend("../usr/share/zoneinfo/", z);
681         if (!what)
682                 return log_oom();
683
684         unlink(where);
685         if (symlink(what, where) < 0) {
686                 log_error("Failed to correct timezone of container: %m");
687                 return 0;
688         }
689
690         return 0;
691 }
692
693 static int setup_resolv_conf(const char *dest) {
694         char _cleanup_free_ *where = NULL;
695
696         assert(dest);
697
698         if (arg_private_network)
699                 return 0;
700
701         /* Fix resolv.conf, if possible */
702         where = strappend(dest, "/etc/resolv.conf");
703         if (!where)
704                 return log_oom();
705
706         /* We don't really care for the results of this really. If it
707          * fails, it fails, but meh... */
708         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
709
710         return 0;
711 }
712
713 static int setup_boot_id(const char *dest) {
714         _cleanup_free_ char *from = NULL, *to = NULL;
715         sd_id128_t rnd;
716         char as_uuid[37];
717         int r;
718
719         assert(dest);
720
721         if (arg_share_system)
722                 return 0;
723
724         /* Generate a new randomized boot ID, so that each boot-up of
725          * the container gets a new one */
726
727         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
728         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
729         if (!from || !to)
730                 return log_oom();
731
732         r = sd_id128_randomize(&rnd);
733         if (r < 0) {
734                 log_error("Failed to generate random boot id: %s", strerror(-r));
735                 return r;
736         }
737
738         snprintf(as_uuid, sizeof(as_uuid),
739                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
740                  SD_ID128_FORMAT_VAL(rnd));
741         char_array_0(as_uuid);
742
743         r = write_string_file(from, as_uuid);
744         if (r < 0) {
745                 log_error("Failed to write boot id: %s", strerror(-r));
746                 return r;
747         }
748
749         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
750                 log_error("Failed to bind mount boot id: %m");
751                 r = -errno;
752         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
753                 log_warning("Failed to make boot id read-only: %m");
754
755         unlink(from);
756         return r;
757 }
758
759 static int copy_devnodes(const char *dest) {
760
761         static const char devnodes[] =
762                 "null\0"
763                 "zero\0"
764                 "full\0"
765                 "random\0"
766                 "urandom\0"
767                 "tty\0";
768
769         const char *d;
770         int r = 0;
771         _cleanup_umask_ mode_t u;
772
773         assert(dest);
774
775         u = umask(0000);
776
777         NULSTR_FOREACH(d, devnodes) {
778                 _cleanup_free_ char *from = NULL, *to = NULL;
779                 struct stat st;
780
781                 from = strappend("/dev/", d);
782                 to = strjoin(dest, "/dev/", d, NULL);
783                 if (!from || !to)
784                         return log_oom();
785
786                 if (stat(from, &st) < 0) {
787
788                         if (errno != ENOENT) {
789                                 log_error("Failed to stat %s: %m", from);
790                                 return -errno;
791                         }
792
793                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
794
795                         log_error("%s is not a char or block device, cannot copy", from);
796                         return -EIO;
797
798                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
799
800                         log_error("mknod(%s) failed: %m", dest);
801                         return  -errno;
802                 }
803         }
804
805         return r;
806 }
807
808 static int setup_ptmx(const char *dest) {
809         _cleanup_free_ char *p = NULL;
810
811         p = strappend(dest, "/dev/ptmx");
812         if (!p)
813                 return log_oom();
814
815         if (symlink("pts/ptmx", p) < 0) {
816                 log_error("Failed to create /dev/ptmx symlink: %m");
817                 return -errno;
818         }
819
820         return 0;
821 }
822
823 static int setup_dev_console(const char *dest, const char *console) {
824         struct stat st;
825         _cleanup_free_ char *to = NULL;
826         int r;
827         _cleanup_umask_ mode_t u;
828
829         assert(dest);
830         assert(console);
831
832         u = umask(0000);
833
834         if (stat(console, &st) < 0) {
835                 log_error("Failed to stat %s: %m", console);
836                 return -errno;
837
838         } else if (!S_ISCHR(st.st_mode)) {
839                 log_error("/dev/console is not a char device");
840                 return -EIO;
841         }
842
843         r = chmod_and_chown(console, 0600, 0, 0);
844         if (r < 0) {
845                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
846                 return r;
847         }
848
849         if (asprintf(&to, "%s/dev/console", dest) < 0)
850                 return log_oom();
851
852         /* We need to bind mount the right tty to /dev/console since
853          * ptys can only exist on pts file systems. To have something
854          * to bind mount things on we create a device node first, that
855          * has the right major/minor (note that the major minor
856          * doesn't actually matter here, since we mount it over
857          * anyway). */
858
859         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
860                 log_error("mknod() for /dev/console failed: %m");
861                 return -errno;
862         }
863
864         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
865                 log_error("Bind mount for /dev/console failed: %m");
866                 return -errno;
867         }
868
869         return 0;
870 }
871
872 static int setup_kmsg(const char *dest, int kmsg_socket) {
873         _cleanup_free_ char *from = NULL, *to = NULL;
874         int r, fd, k;
875         _cleanup_umask_ mode_t u;
876         union {
877                 struct cmsghdr cmsghdr;
878                 uint8_t buf[CMSG_SPACE(sizeof(int))];
879         } control = {};
880         struct msghdr mh = {
881                 .msg_control = &control,
882                 .msg_controllen = sizeof(control),
883         };
884         struct cmsghdr *cmsg;
885
886         assert(dest);
887         assert(kmsg_socket >= 0);
888
889         u = umask(0000);
890
891         /* We create the kmsg FIFO as /dev/kmsg, but immediately
892          * delete it after bind mounting it to /proc/kmsg. While FIFOs
893          * on the reading side behave very similar to /proc/kmsg,
894          * their writing side behaves differently from /dev/kmsg in
895          * that writing blocks when nothing is reading. In order to
896          * avoid any problems with containers deadlocking due to this
897          * we simply make /dev/kmsg unavailable to the container. */
898         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
899             asprintf(&to, "%s/proc/kmsg", dest) < 0)
900                 return log_oom();
901
902         if (mkfifo(from, 0600) < 0) {
903                 log_error("mkfifo() for /dev/kmsg failed: %m");
904                 return -errno;
905         }
906
907         r = chmod_and_chown(from, 0600, 0, 0);
908         if (r < 0) {
909                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
910                 return r;
911         }
912
913         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
914                 log_error("Bind mount for /proc/kmsg failed: %m");
915                 return -errno;
916         }
917
918         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
919         if (fd < 0) {
920                 log_error("Failed to open fifo: %m");
921                 return -errno;
922         }
923
924         cmsg = CMSG_FIRSTHDR(&mh);
925         cmsg->cmsg_level = SOL_SOCKET;
926         cmsg->cmsg_type = SCM_RIGHTS;
927         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
928         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
929
930         mh.msg_controllen = cmsg->cmsg_len;
931
932         /* Store away the fd in the socket, so that it stays open as
933          * long as we run the child */
934         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
935         close_nointr_nofail(fd);
936
937         if (k < 0) {
938                 log_error("Failed to send FIFO fd: %m");
939                 return -errno;
940         }
941
942         /* And now make the FIFO unavailable as /dev/kmsg... */
943         unlink(from);
944         return 0;
945 }
946
947 static int setup_hostname(void) {
948
949         if (arg_share_system)
950                 return 0;
951
952         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
953                 return -errno;
954
955         return 0;
956 }
957
958 static int setup_journal(const char *directory) {
959         sd_id128_t machine_id, this_id;
960         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
961         char *id;
962         int r;
963
964         p = strappend(directory, "/etc/machine-id");
965         if (!p)
966                 return log_oom();
967
968         r = read_one_line_file(p, &b);
969         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
970                 return 0;
971         else if (r < 0) {
972                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
973                 return r;
974         }
975
976         id = strstrip(b);
977         if (isempty(id) && arg_link_journal == LINK_AUTO)
978                 return 0;
979
980         /* Verify validity */
981         r = sd_id128_from_string(id, &machine_id);
982         if (r < 0) {
983                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
984                 return r;
985         }
986
987         r = sd_id128_get_machine(&this_id);
988         if (r < 0) {
989                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
990                 return r;
991         }
992
993         if (sd_id128_equal(machine_id, this_id)) {
994                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
995                          "Host and machine ids are equal (%s): refusing to link journals", id);
996                 if (arg_link_journal == LINK_AUTO)
997                         return 0;
998                 return
999                         -EEXIST;
1000         }
1001
1002         if (arg_link_journal == LINK_NO)
1003                 return 0;
1004
1005         free(p);
1006         p = strappend("/var/log/journal/", id);
1007         q = strjoin(directory, "/var/log/journal/", id, NULL);
1008         if (!p || !q)
1009                 return log_oom();
1010
1011         if (path_is_mount_point(p, false) > 0) {
1012                 if (arg_link_journal != LINK_AUTO) {
1013                         log_error("%s: already a mount point, refusing to use for journal", p);
1014                         return -EEXIST;
1015                 }
1016
1017                 return 0;
1018         }
1019
1020         if (path_is_mount_point(q, false) > 0) {
1021                 if (arg_link_journal != LINK_AUTO) {
1022                         log_error("%s: already a mount point, refusing to use for journal", q);
1023                         return -EEXIST;
1024                 }
1025
1026                 return 0;
1027         }
1028
1029         r = readlink_and_make_absolute(p, &d);
1030         if (r >= 0) {
1031                 if ((arg_link_journal == LINK_GUEST ||
1032                      arg_link_journal == LINK_AUTO) &&
1033                     path_equal(d, q)) {
1034
1035                         r = mkdir_p(q, 0755);
1036                         if (r < 0)
1037                                 log_warning("failed to create directory %s: %m", q);
1038                         return 0;
1039                 }
1040
1041                 if (unlink(p) < 0) {
1042                         log_error("Failed to remove symlink %s: %m", p);
1043                         return -errno;
1044                 }
1045         } else if (r == -EINVAL) {
1046
1047                 if (arg_link_journal == LINK_GUEST &&
1048                     rmdir(p) < 0) {
1049
1050                         if (errno == ENOTDIR) {
1051                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1052                                 return r;
1053                         } else {
1054                                 log_error("Failed to remove %s: %m", p);
1055                                 return -errno;
1056                         }
1057                 }
1058         } else if (r != -ENOENT) {
1059                 log_error("readlink(%s) failed: %m", p);
1060                 return r;
1061         }
1062
1063         if (arg_link_journal == LINK_GUEST) {
1064
1065                 if (symlink(q, p) < 0) {
1066                         log_error("Failed to symlink %s to %s: %m", q, p);
1067                         return -errno;
1068                 }
1069
1070                 r = mkdir_p(q, 0755);
1071                 if (r < 0)
1072                         log_warning("failed to create directory %s: %m", q);
1073                 return 0;
1074         }
1075
1076         if (arg_link_journal == LINK_HOST) {
1077                 r = mkdir_p(p, 0755);
1078                 if (r < 0) {
1079                         log_error("Failed to create %s: %m", p);
1080                         return r;
1081                 }
1082
1083         } else if (access(p, F_OK) < 0)
1084                 return 0;
1085
1086         if (dir_is_empty(q) == 0) {
1087                 log_error("%s not empty.", q);
1088                 return -ENOTEMPTY;
1089         }
1090
1091         r = mkdir_p(q, 0755);
1092         if (r < 0) {
1093                 log_error("Failed to create %s: %m", q);
1094                 return r;
1095         }
1096
1097         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1098                 log_error("Failed to bind mount journal from host into guest: %m");
1099                 return -errno;
1100         }
1101
1102         return 0;
1103 }
1104
1105 static int setup_kdbus(const char *dest, const char *path) {
1106         const char *p;
1107
1108         if (!path)
1109                 return 0;
1110
1111         p = strappenda(dest, "/dev/kdbus");
1112         if (mkdir(p, 0755) < 0) {
1113                 log_error("Failed to create kdbus path: %m");
1114                 return  -errno;
1115         }
1116
1117         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1118                 log_error("Failed to mount kdbus domain path: %m");
1119                 return -errno;
1120         }
1121
1122         return 0;
1123 }
1124
1125 static int drop_capabilities(void) {
1126         return capability_bounding_set_drop(~arg_retain, false);
1127 }
1128
1129 static int register_machine(pid_t pid) {
1130         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1131         _cleanup_bus_unref_ sd_bus *bus = NULL;
1132         int r;
1133
1134         if (!arg_register)
1135                 return 0;
1136
1137         r = sd_bus_default_system(&bus);
1138         if (r < 0) {
1139                 log_error("Failed to open system bus: %s", strerror(-r));
1140                 return r;
1141         }
1142
1143         if (arg_keep_unit) {
1144                 r = sd_bus_call_method(
1145                                 bus,
1146                                 "org.freedesktop.machine1",
1147                                 "/org/freedesktop/machine1",
1148                                 "org.freedesktop.machine1.Manager",
1149                                 "RegisterMachine",
1150                                 &error,
1151                                 NULL,
1152                                 "sayssus",
1153                                 arg_machine,
1154                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1155                                 "nspawn",
1156                                 "container",
1157                                 (uint32_t) pid,
1158                                 strempty(arg_directory));
1159         } else {
1160                 r = sd_bus_call_method(
1161                                 bus,
1162                                 "org.freedesktop.machine1",
1163                                 "/org/freedesktop/machine1",
1164                                 "org.freedesktop.machine1.Manager",
1165                                 "CreateMachine",
1166                                 &error,
1167                                 NULL,
1168                                 "sayssusa(sv)",
1169                                 arg_machine,
1170                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1171                                 "nspawn",
1172                                 "container",
1173                                 (uint32_t) pid,
1174                                 strempty(arg_directory),
1175                                 !isempty(arg_slice), "Slice", "s", arg_slice);
1176         }
1177
1178         if (r < 0) {
1179                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1180                 return r;
1181         }
1182
1183         return 0;
1184 }
1185
1186 static int terminate_machine(pid_t pid) {
1187         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1188         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1189         _cleanup_bus_unref_ sd_bus *bus = NULL;
1190         const char *path;
1191         int r;
1192
1193         if (!arg_register)
1194                 return 0;
1195
1196         r = sd_bus_default_system(&bus);
1197         if (r < 0) {
1198                 log_error("Failed to open system bus: %s", strerror(-r));
1199                 return r;
1200         }
1201
1202         r = sd_bus_call_method(
1203                         bus,
1204                         "org.freedesktop.machine1",
1205                         "/org/freedesktop/machine1",
1206                         "org.freedesktop.machine1.Manager",
1207                         "GetMachineByPID",
1208                         &error,
1209                         &reply,
1210                         "u",
1211                         (uint32_t) pid);
1212         if (r < 0) {
1213                 /* Note that the machine might already have been
1214                  * cleaned up automatically, hence don't consider it a
1215                  * failure if we cannot get the machine object. */
1216                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1217                 return 0;
1218         }
1219
1220         r = sd_bus_message_read(reply, "o", &path);
1221         if (r < 0)
1222                 return bus_log_parse_error(r);
1223
1224         r = sd_bus_call_method(
1225                         bus,
1226                         "org.freedesktop.machine1",
1227                         path,
1228                         "org.freedesktop.machine1.Machine",
1229                         "Terminate",
1230                         &error,
1231                         NULL,
1232                         NULL);
1233         if (r < 0) {
1234                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1235                 return 0;
1236         }
1237
1238         return 0;
1239 }
1240
1241 static int reset_audit_loginuid(void) {
1242         _cleanup_free_ char *p = NULL;
1243         int r;
1244
1245         if (arg_share_system)
1246                 return 0;
1247
1248         r = read_one_line_file("/proc/self/loginuid", &p);
1249         if (r == -EEXIST)
1250                 return 0;
1251         if (r < 0) {
1252                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1253                 return r;
1254         }
1255
1256         /* Already reset? */
1257         if (streq(p, "4294967295"))
1258                 return 0;
1259
1260         r = write_string_file("/proc/self/loginuid", "4294967295");
1261         if (r < 0) {
1262                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1263                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1264                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1265                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1266                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1267
1268                 sleep(5);
1269         }
1270
1271         return 0;
1272 }
1273
1274 static int setup_veth(int netns_fd) {
1275         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1276         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1277         char iface_name[IFNAMSIZ] = "ve-";
1278         int r;
1279
1280         if (!arg_private_network)
1281                 return 0;
1282
1283         if (!arg_network_veth)
1284                 return 0;
1285
1286         strncpy(iface_name+3, arg_machine, sizeof(iface_name) - 3);
1287
1288         r = sd_rtnl_open(0, &rtnl);
1289         if (r < 0) {
1290                 log_error("Failed to connect to netlink: %s", strerror(-r));
1291                 return r;
1292         }
1293
1294         r = sd_rtnl_message_new_link(RTM_NEWLINK, 0, &m);
1295         if (r < 0) {
1296                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1297                 return r;
1298         }
1299
1300         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1301         if (r < 0) {
1302                 log_error("Failed to append netlink kind: %s", strerror(-r));
1303                 return r;
1304         }
1305
1306         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1307         if (r < 0) {
1308                 log_error("Failed to open netlink container: %s", strerror(-r));
1309                 return r;
1310         }
1311
1312         r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1313         if (r < 0) {
1314                 log_error("Failed to append netlink kind: %s", strerror(-r));
1315                 return r;
1316         }
1317
1318         r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1319         if (r < 0) {
1320                 log_error("Failed to open netlink container: %s", strerror(-r));
1321                 return r;
1322         }
1323
1324         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1325         if (r < 0) {
1326                 log_error("z Failed to open netlink container: %s", strerror(-r));
1327                 return r;
1328         }
1329
1330         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1331         if (r < 0) {
1332                 log_error("Failed to append netlink kind: %s", strerror(-r));
1333                 return r;
1334         }
1335
1336         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_FD, netns_fd);
1337         if (r < 0) {
1338                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1339                 return r;
1340         }
1341
1342         r = sd_rtnl_message_close_container(m);
1343         if (r < 0) {
1344                 log_error("Failed to close netlink container: %s", strerror(-r));
1345                 return r;
1346         }
1347
1348         r = sd_rtnl_message_close_container(m);
1349         if (r < 0) {
1350                 log_error("Failed to close netlink container: %s", strerror(-r));
1351                 return r;
1352         }
1353
1354         r = sd_rtnl_message_close_container(m);
1355         if (r < 0) {
1356                 log_error("Failed to close netlink container: %s", strerror(-r));
1357                 return r;
1358         }
1359
1360         r = sd_rtnl_call(rtnl, m, 0, NULL);
1361         if (r < 0) {
1362                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1363                 return r;
1364         }
1365
1366         return 0;
1367 }
1368
1369 static int move_network_interfaces(pid_t pid) {
1370         _cleanup_udev_unref_ struct udev *udev = NULL;
1371         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1372         char **i;
1373         int r;
1374
1375         if (!arg_private_network)
1376                 return 0;
1377
1378         if (strv_isempty(arg_network_interfaces))
1379                 return 0;
1380
1381         r = sd_rtnl_open(0, &rtnl);
1382         if (r < 0) {
1383                 log_error("Failed to connect to netlink: %s", strerror(-r));
1384                 return r;
1385         }
1386
1387         udev = udev_new();
1388         if (!udev) {
1389                 log_error("Failed to connect to udev.");
1390                 return -ENOMEM;
1391         }
1392
1393         STRV_FOREACH(i, arg_network_interfaces) {
1394                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1395                 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1396                 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1397                 int ifi;
1398
1399                 ifi = (int) if_nametoindex(*i);
1400                 if (ifi <= 0) {
1401                         log_error("Failed to resolve interface %s: %m", *i);
1402                         return -errno;
1403                 }
1404
1405                 sprintf(ifi_str, "n%i", ifi);
1406                 d = udev_device_new_from_device_id(udev, ifi_str);
1407                 if (!d) {
1408                         log_error("Failed to get udev device for interface %s: %m", *i);
1409                         return -errno;
1410                 }
1411
1412                 if (udev_device_get_is_initialized(d) <= 0) {
1413                         log_error("Network interface %s is not initialized yet.", *i);
1414                         return -EBUSY;
1415                 }
1416
1417                 r = sd_rtnl_message_new_link(RTM_NEWLINK, ifi, &m);
1418                 if (r < 0) {
1419                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1420                         return r;
1421                 }
1422
1423                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1424                 if (r < 0) {
1425                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1426                         return r;
1427                 }
1428
1429                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1430                 if (r < 0) {
1431                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1432                         return r;
1433                 }
1434         }
1435
1436         return 0;
1437 }
1438
/* Install a seccomp filter that makes the creation of audit netlink
 * sockets fail.
 *
 * If built without HAVE_SECCOMP this is a no-op returning 0.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx ctx;
        int r;

        /*
           Audit does not work properly in containers; a good part of
           the userspace audit hookup fails when run inside of one.
           Rather than caring about that, creation of audit sockets
           is simply turned off.

           As a result socket(AF_NETLINK, *, NETLINK_AUDIT) fails with
           EAFNOSUPPORT, which audit userspace takes as a sign that
           audit is disabled in the kernel.
         */

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_rule_add_exact(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0)
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
        else {
                r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
                if (r < 0)
                        log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                else {
                        r = seccomp_load(ctx);
                        if (r < 0)
                                log_error("Failed to install seccomp audit filter: %s", strerror(-r));
                }
        }

        /* The filter context is released on every path */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
1489
1490 int main(int argc, char *argv[]) {
1491
1492         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1, netns_fd = -1;
1493         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1494         _cleanup_free_ char *kdbus_domain = NULL;
1495         _cleanup_fdset_free_ FDSet *fds = NULL;
1496         const char *console = NULL;
1497         int r = EXIT_FAILURE, k;
1498         int n_fd_passed;
1499         pid_t pid = 0;
1500         sigset_t mask;
1501
1502         log_parse_environment();
1503         log_open();
1504
1505         k = parse_argv(argc, argv);
1506         if (k < 0)
1507                 goto finish;
1508         else if (k == 0) {
1509                 r = EXIT_SUCCESS;
1510                 goto finish;
1511         }
1512
1513         if (arg_directory) {
1514                 char *p;
1515
1516                 p = path_make_absolute_cwd(arg_directory);
1517                 free(arg_directory);
1518                 arg_directory = p;
1519         } else
1520                 arg_directory = get_current_dir_name();
1521
1522         if (!arg_directory) {
1523                 log_error("Failed to determine path, please use -D.");
1524                 goto finish;
1525         }
1526
1527         path_kill_slashes(arg_directory);
1528
1529         if (!arg_machine) {
1530                 arg_machine = strdup(basename(arg_directory));
1531                 if (!arg_machine) {
1532                         log_oom();
1533                         goto finish;
1534                 }
1535
1536                 hostname_cleanup(arg_machine, false);
1537                 if (isempty(arg_machine)) {
1538                         log_error("Failed to determine machine name automatically, please use -M.");
1539                         goto finish;
1540                 }
1541         }
1542
1543         if (geteuid() != 0) {
1544                 log_error("Need to be root.");
1545                 goto finish;
1546         }
1547
1548         if (sd_booted() <= 0) {
1549                 log_error("Not running on a systemd system.");
1550                 goto finish;
1551         }
1552
1553         if (path_equal(arg_directory, "/")) {
1554                 log_error("Spawning container on root directory not supported.");
1555                 goto finish;
1556         }
1557
1558         if (arg_boot) {
1559                 if (path_is_os_tree(arg_directory) <= 0) {
1560                         log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1561                         goto finish;
1562                 }
1563         } else {
1564                 const char *p;
1565
1566                 p = strappenda(arg_directory,
1567                                argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
1568                 if (access(p, F_OK) < 0) {
1569                         log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
1570                         goto finish;
1571
1572                 }
1573         }
1574
1575         log_close();
1576         n_fd_passed = sd_listen_fds(false);
1577         if (n_fd_passed > 0) {
1578                 k = fdset_new_listen_fds(&fds, false);
1579                 if (k < 0) {
1580                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1581                         goto finish;
1582                 }
1583         }
1584         fdset_close_others(fds);
1585         log_open();
1586
1587         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1588         if (master < 0) {
1589                 log_error("Failed to acquire pseudo tty: %m");
1590                 goto finish;
1591         }
1592
1593         console = ptsname(master);
1594         if (!console) {
1595                 log_error("Failed to determine tty name: %m");
1596                 goto finish;
1597         }
1598
1599         if (!arg_quiet)
1600                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1601
1602         if (unlockpt(master) < 0) {
1603                 log_error("Failed to unlock tty: %m");
1604                 goto finish;
1605         }
1606
1607         if (arg_network_veth) {
1608                 netns_fd = open("/proc/self/ns/net", O_RDWR|O_CLOEXEC);
1609                 if (netns_fd < 0) {
1610                         log_error("Failed to open network namespace fd: %m");
1611                         goto finish;
1612                 }
1613         }
1614
1615         if (access("/dev/kdbus/control", F_OK) >= 0) {
1616
1617                 if (arg_share_system) {
1618                         kdbus_domain = strdup("/dev/kdbus");
1619                         if (!kdbus_domain) {
1620                                 log_oom();
1621                                 goto finish;
1622                         }
1623                 } else {
1624                         const char *ns;
1625
1626                         ns = strappenda("machine-", arg_machine);
1627                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1628                         if (r < 0)
1629                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1630                         else
1631                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1632                 }
1633         }
1634
1635         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1636                 log_error("Failed to create kmsg socket pair: %m");
1637                 goto finish;
1638         }
1639
1640         sd_notify(0, "READY=1");
1641
1642         assert_se(sigemptyset(&mask) == 0);
1643         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1644         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1645
1646         for (;;) {
1647                 siginfo_t status;
1648
1649                 sync_fd = eventfd(0, EFD_CLOEXEC);
1650                 if (sync_fd < 0) {
1651                         log_error("Failed to create event fd: %m");
1652                         goto finish;
1653                 }
1654
1655                 pid = syscall(__NR_clone,
1656                               SIGCHLD|CLONE_NEWNS|
1657                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1658                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
1659                 if (pid < 0) {
1660                         if (errno == EINVAL)
1661                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1662                         else
1663                                 log_error("clone() failed: %m");
1664
1665                         goto finish;
1666                 }
1667
1668                 if (pid == 0) {
1669                         /* child */
1670                         const char *home = NULL;
1671                         uid_t uid = (uid_t) -1;
1672                         gid_t gid = (gid_t) -1;
1673                         unsigned n_env = 2;
1674                         const char *envp[] = {
1675                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1676                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1677                                 NULL, /* TERM */
1678                                 NULL, /* HOME */
1679                                 NULL, /* USER */
1680                                 NULL, /* LOGNAME */
1681                                 NULL, /* container_uuid */
1682                                 NULL, /* LISTEN_FDS */
1683                                 NULL, /* LISTEN_PID */
1684                                 NULL
1685                         };
1686                         char **env_use;
1687                         eventfd_t x;
1688
1689                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1690                         if (envp[n_env])
1691                                 n_env ++;
1692
1693                         close_nointr_nofail(master);
1694                         master = -1;
1695
1696                         close_nointr(STDIN_FILENO);
1697                         close_nointr(STDOUT_FILENO);
1698                         close_nointr(STDERR_FILENO);
1699
1700                         close_nointr_nofail(kmsg_socket_pair[0]);
1701                         kmsg_socket_pair[0] = -1;
1702
1703                         reset_all_signal_handlers();
1704
1705                         assert_se(sigemptyset(&mask) == 0);
1706                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1707
1708                         k = open_terminal(console, O_RDWR);
1709                         if (k != STDIN_FILENO) {
1710                                 if (k >= 0) {
1711                                         close_nointr_nofail(k);
1712                                         k = -EINVAL;
1713                                 }
1714
1715                                 log_error("Failed to open console: %s", strerror(-k));
1716                                 goto child_fail;
1717                         }
1718
1719                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1720                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1721                                 log_error("Failed to duplicate console: %m");
1722                                 goto child_fail;
1723                         }
1724
1725                         if (setsid() < 0) {
1726                                 log_error("setsid() failed: %m");
1727                                 goto child_fail;
1728                         }
1729
1730                         if (reset_audit_loginuid() < 0)
1731                                 goto child_fail;
1732
1733                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1734                                 log_error("PR_SET_PDEATHSIG failed: %m");
1735                                 goto child_fail;
1736                         }
1737
1738                         /* Mark everything as slave, so that we still
1739                          * receive mounts from the real root, but don't
1740                          * propagate mounts to the real root. */
1741                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1742                                 log_error("MS_SLAVE|MS_REC failed: %m");
1743                                 goto child_fail;
1744                         }
1745
1746                         /* Turn directory into bind mount */
1747                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1748                                 log_error("Failed to make bind mount.");
1749                                 goto child_fail;
1750                         }
1751
1752                         if (arg_read_only)
1753                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1754                                         log_error("Failed to make read-only.");
1755                                         goto child_fail;
1756                                 }
1757
1758                         if (mount_all(arg_directory) < 0)
1759                                 goto child_fail;
1760
1761                         if (copy_devnodes(arg_directory) < 0)
1762                                 goto child_fail;
1763
1764                         if (setup_ptmx(arg_directory) < 0)
1765                                 goto child_fail;
1766
1767                         dev_setup(arg_directory);
1768
1769                         if (setup_veth(netns_fd) < 0)
1770                                 goto child_fail;
1771
1772                         if (netns_fd >= 0) {
1773                                 close_nointr_nofail(netns_fd);
1774                                 netns_fd = -1;
1775                         }
1776
1777                         if (audit_still_doesnt_work_in_containers() < 0)
1778                                 goto child_fail;
1779
1780                         if (setup_dev_console(arg_directory, console) < 0)
1781                                 goto child_fail;
1782
1783                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1784                                 goto child_fail;
1785
1786                         close_nointr_nofail(kmsg_socket_pair[1]);
1787                         kmsg_socket_pair[1] = -1;
1788
1789                         if (setup_boot_id(arg_directory) < 0)
1790                                 goto child_fail;
1791
1792                         if (setup_timezone(arg_directory) < 0)
1793                                 goto child_fail;
1794
1795                         if (setup_resolv_conf(arg_directory) < 0)
1796                                 goto child_fail;
1797
1798                         if (setup_journal(arg_directory) < 0)
1799                                 goto child_fail;
1800
1801                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1802                                 goto child_fail;
1803
1804                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1805                                 goto child_fail;
1806
1807                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1808                                 goto child_fail;
1809
1810                         if (chdir(arg_directory) < 0) {
1811                                 log_error("chdir(%s) failed: %m", arg_directory);
1812                                 goto child_fail;
1813                         }
1814
1815                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1816                                 log_error("mount(MS_MOVE) failed: %m");
1817                                 goto child_fail;
1818                         }
1819
1820                         if (chroot(".") < 0) {
1821                                 log_error("chroot() failed: %m");
1822                                 goto child_fail;
1823                         }
1824
1825                         if (chdir("/") < 0) {
1826                                 log_error("chdir() failed: %m");
1827                                 goto child_fail;
1828                         }
1829
1830                         umask(0022);
1831
1832                         if (arg_private_network)
1833                                 loopback_setup();
1834
1835                         if (drop_capabilities() < 0) {
1836                                 log_error("drop_capabilities() failed: %m");
1837                                 goto child_fail;
1838                         }
1839
1840                         if (arg_user) {
1841
1842                                 /* Note that this resolves user names
1843                                  * inside the container, and hence
1844                                  * accesses the NSS modules from the
1845                                  * container and not the host. This is
1846                                  * a bit weird... */
1847
1848                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1849                                         log_error("get_user_creds() failed: %m");
1850                                         goto child_fail;
1851                                 }
1852
1853                                 if (mkdir_parents_label(home, 0775) < 0) {
1854                                         log_error("mkdir_parents_label() failed: %m");
1855                                         goto child_fail;
1856                                 }
1857
1858                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1859                                         log_error("mkdir_safe_label() failed: %m");
1860                                         goto child_fail;
1861                                 }
1862
1863                                 if (initgroups((const char*)arg_user, gid) < 0) {
1864                                         log_error("initgroups() failed: %m");
1865                                         goto child_fail;
1866                                 }
1867
1868                                 if (setresgid(gid, gid, gid) < 0) {
1869                                         log_error("setregid() failed: %m");
1870                                         goto child_fail;
1871                                 }
1872
1873                                 if (setresuid(uid, uid, uid) < 0) {
1874                                         log_error("setreuid() failed: %m");
1875                                         goto child_fail;
1876                                 }
1877                         } else {
1878                                 /* Reset everything fully to 0, just in case */
1879
1880                                 if (setgroups(0, NULL) < 0) {
1881                                         log_error("setgroups() failed: %m");
1882                                         goto child_fail;
1883                                 }
1884
1885                                 if (setresgid(0, 0, 0) < 0) {
1886                                         log_error("setregid() failed: %m");
1887                                         goto child_fail;
1888                                 }
1889
1890                                 if (setresuid(0, 0, 0) < 0) {
1891                                         log_error("setreuid() failed: %m");
1892                                         goto child_fail;
1893                                 }
1894                         }
1895
1896                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1897                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1898                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1899                                 log_oom();
1900                                 goto child_fail;
1901                         }
1902
1903                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1904                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1905                                         log_oom();
1906                                         goto child_fail;
1907                                 }
1908                         }
1909
1910                         if (fdset_size(fds) > 0) {
1911                                 k = fdset_cloexec(fds, false);
1912                                 if (k < 0) {
1913                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1914                                         goto child_fail;
1915                                 }
1916
1917                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1918                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1919                                         log_oom();
1920                                         goto child_fail;
1921                                 }
1922                         }
1923
1924                         setup_hostname();
1925
1926                         eventfd_read(sync_fd, &x);
1927                         close_nointr_nofail(sync_fd);
1928                         sync_fd = -1;
1929
1930                         if (!strv_isempty(arg_setenv)) {
1931                                 char **n;
1932
1933                                 n = strv_env_merge(2, envp, arg_setenv);
1934                                 if (!n) {
1935                                         log_oom();
1936                                         goto child_fail;
1937                                 }
1938
1939                                 env_use = n;
1940                         } else
1941                                 env_use = (char**) envp;
1942
1943 #ifdef HAVE_SELINUX
1944                         if (arg_selinux_context)
1945                                 if (setexeccon(arg_selinux_context) < 0)
1946                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1947 #endif
1948                         if (arg_boot) {
1949                                 char **a;
1950                                 size_t l;
1951
1952                                 /* Automatically search for the init system */
1953
1954                                 l = 1 + argc - optind;
1955                                 a = newa(char*, l + 1);
1956                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1957
1958                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1959                                 execve(a[0], a, env_use);
1960
1961                                 a[0] = (char*) "/lib/systemd/systemd";
1962                                 execve(a[0], a, env_use);
1963
1964                                 a[0] = (char*) "/sbin/init";
1965                                 execve(a[0], a, env_use);
1966                         } else if (argc > optind)
1967                                 execvpe(argv[optind], argv + optind, env_use);
1968                         else {
1969                                 chdir(home ? home : "/root");
1970                                 execle("/bin/bash", "-bash", NULL, env_use);
1971                                 execle("/bin/sh", "-sh", NULL, env_use);
1972                         }
1973
1974                         log_error("execv() failed: %m");
1975
1976                 child_fail:
1977                         _exit(EXIT_FAILURE);
1978                 }
1979
1980                 fdset_free(fds);
1981                 fds = NULL;
1982
1983                 r = register_machine(pid);
1984                 if (r < 0)
1985                         goto finish;
1986
1987                 r = move_network_interfaces(pid);
1988                 if (r < 0)
1989                         goto finish;
1990
1991                 eventfd_write(sync_fd, 1);
1992                 close_nointr_nofail(sync_fd);
1993                 sync_fd = -1;
1994
1995                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1996                 if (k < 0) {
1997                         r = EXIT_FAILURE;
1998                         break;
1999                 }
2000
2001                 if (!arg_quiet)
2002                         putc('\n', stdout);
2003
2004                 /* Kill if it is not dead yet anyway */
2005                 terminate_machine(pid);
2006
2007                 /* Redundant, but better safe than sorry */
2008                 kill(pid, SIGKILL);
2009
2010                 k = wait_for_terminate(pid, &status);
2011                 pid = 0;
2012
2013                 if (k < 0) {
2014                         r = EXIT_FAILURE;
2015                         break;
2016                 }
2017
2018                 if (status.si_code == CLD_EXITED) {
2019                         r = status.si_status;
2020                         if (status.si_status != 0) {
2021                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
2022                                 break;
2023                         }
2024
2025                         if (!arg_quiet)
2026                                 log_debug("Container %s exited successfully.", arg_machine);
2027                         break;
2028                 } else if (status.si_code == CLD_KILLED &&
2029                            status.si_status == SIGINT) {
2030
2031                         if (!arg_quiet)
2032                                 log_info("Container %s has been shut down.", arg_machine);
2033                         r = 0;
2034                         break;
2035                 } else if (status.si_code == CLD_KILLED &&
2036                            status.si_status == SIGHUP) {
2037
2038                         if (!arg_quiet)
2039                                 log_info("Container %s is being rebooted.", arg_machine);
2040                         continue;
2041                 } else if (status.si_code == CLD_KILLED ||
2042                            status.si_code == CLD_DUMPED) {
2043
2044                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2045                         r = EXIT_FAILURE;
2046                         break;
2047                 } else {
2048                         log_error("Container %s failed due to unknown reason.", arg_machine);
2049                         r = EXIT_FAILURE;
2050                         break;
2051                 }
2052         }
2053
2054 finish:
2055         if (pid > 0)
2056                 kill(pid, SIGKILL);
2057
2058         free(arg_directory);
2059         free(arg_machine);
2060         free(arg_setenv);
2061         free(arg_network_interfaces);
2062
2063         return r;
2064 }