chiark / gitweb /
3a6d428cd5e682231079e1a8001d067e45a1ced8
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <linux/rtnetlink.h>
44 #include <sys/eventfd.h>
45 #include <net/if.h>
46 #include <linux/veth.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #include "sd-daemon.h"
57 #include "sd-bus.h"
58 #include "sd-id128.h"
59 #include "sd-rtnl.h"
60 #include "log.h"
61 #include "util.h"
62 #include "mkdir.h"
63 #include "macro.h"
64 #include "audit.h"
65 #include "missing.h"
66 #include "cgroup-util.h"
67 #include "strv.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
71 #include "fdset.h"
72 #include "build.h"
73 #include "fileio.h"
74 #include "bus-util.h"
75 #include "bus-error.h"
76 #include "ptyfwd.h"
77 #include "bus-kernel.h"
78 #include "env-util.h"
79 #include "def.h"
80 #include "rtnl-util.h"
81 #include "udev-util.h"
82
/* Journal linking mode; the values map one-to-one onto the strings
 * accepted by --link-journal= ("no", "auto", "host", "guest"). */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
89
90 static char *arg_directory = NULL;
91 static char *arg_user = NULL;
92 static sd_id128_t arg_uuid = {};
93 static char *arg_machine = NULL;
94 static char *arg_selinux_context = NULL;
95 static char *arg_selinux_apifs_context = NULL;
96 static const char *arg_slice = NULL;
97 static bool arg_private_network = false;
98 static bool arg_read_only = false;
99 static bool arg_boot = false;
100 static LinkJournal arg_link_journal = LINK_AUTO;
101 static uint64_t arg_retain =
102         (1ULL << CAP_CHOWN) |
103         (1ULL << CAP_DAC_OVERRIDE) |
104         (1ULL << CAP_DAC_READ_SEARCH) |
105         (1ULL << CAP_FOWNER) |
106         (1ULL << CAP_FSETID) |
107         (1ULL << CAP_IPC_OWNER) |
108         (1ULL << CAP_KILL) |
109         (1ULL << CAP_LEASE) |
110         (1ULL << CAP_LINUX_IMMUTABLE) |
111         (1ULL << CAP_NET_BIND_SERVICE) |
112         (1ULL << CAP_NET_BROADCAST) |
113         (1ULL << CAP_NET_RAW) |
114         (1ULL << CAP_SETGID) |
115         (1ULL << CAP_SETFCAP) |
116         (1ULL << CAP_SETPCAP) |
117         (1ULL << CAP_SETUID) |
118         (1ULL << CAP_SYS_ADMIN) |
119         (1ULL << CAP_SYS_CHROOT) |
120         (1ULL << CAP_SYS_NICE) |
121         (1ULL << CAP_SYS_PTRACE) |
122         (1ULL << CAP_SYS_TTY_CONFIG) |
123         (1ULL << CAP_SYS_RESOURCE) |
124         (1ULL << CAP_SYS_BOOT) |
125         (1ULL << CAP_AUDIT_WRITE) |
126         (1ULL << CAP_AUDIT_CONTROL) |
127         (1ULL << CAP_MKNOD);
128 static char **arg_bind = NULL;
129 static char **arg_bind_ro = NULL;
130 static char **arg_setenv = NULL;
131 static bool arg_quiet = false;
132 static bool arg_share_system = false;
133 static bool arg_register = true;
134 static bool arg_keep_unit = false;
135 static char **arg_network_interfaces = NULL;
136 static bool arg_network_veth = false;
137
/* Prints the usage summary for all supported options to stdout.
 * Always returns 0. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=NAME       Root directory for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               /* parse_argv() maps -j to LINK_GUEST, so document it as such */
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n",
               program_invocation_short_name);

        return 0;
}
181
182 static int parse_argv(int argc, char *argv[]) {
183
184         enum {
185                 ARG_VERSION = 0x100,
186                 ARG_PRIVATE_NETWORK,
187                 ARG_UUID,
188                 ARG_READ_ONLY,
189                 ARG_CAPABILITY,
190                 ARG_DROP_CAPABILITY,
191                 ARG_LINK_JOURNAL,
192                 ARG_BIND,
193                 ARG_BIND_RO,
194                 ARG_SETENV,
195                 ARG_SHARE_SYSTEM,
196                 ARG_REGISTER,
197                 ARG_KEEP_UNIT,
198                 ARG_NETWORK_INTERFACE,
199                 ARG_NETWORK_VETH,
200         };
201
202         static const struct option options[] = {
203                 { "help",                  no_argument,       NULL, 'h'                   },
204                 { "version",               no_argument,       NULL, ARG_VERSION           },
205                 { "directory",             required_argument, NULL, 'D'                   },
206                 { "user",                  required_argument, NULL, 'u'                   },
207                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
208                 { "boot",                  no_argument,       NULL, 'b'                   },
209                 { "uuid",                  required_argument, NULL, ARG_UUID              },
210                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
211                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
212                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
213                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
214                 { "bind",                  required_argument, NULL, ARG_BIND              },
215                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
216                 { "machine",               required_argument, NULL, 'M'                   },
217                 { "slice",                 required_argument, NULL, 'S'                   },
218                 { "setenv",                required_argument, NULL, ARG_SETENV            },
219                 { "selinux-context",       required_argument, NULL, 'Z'                   },
220                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
221                 { "quiet",                 no_argument,       NULL, 'q'                   },
222                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
223                 { "register",              required_argument, NULL, ARG_REGISTER          },
224                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
225                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
226                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH   },
227                 {}
228         };
229
230         int c, r;
231         uint64_t plus = 0, minus = 0;
232
233         assert(argc >= 0);
234         assert(argv);
235
236         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
237
238                 switch (c) {
239
240                 case 'h':
241                         return help();
242
243                 case ARG_VERSION:
244                         puts(PACKAGE_STRING);
245                         puts(SYSTEMD_FEATURES);
246                         return 0;
247
248                 case 'D':
249                         free(arg_directory);
250                         arg_directory = canonicalize_file_name(optarg);
251                         if (!arg_directory) {
252                                 log_error("Invalid root directory: %m");
253                                 return -ENOMEM;
254                         }
255
256                         break;
257
258                 case 'u':
259                         free(arg_user);
260                         arg_user = strdup(optarg);
261                         if (!arg_user)
262                                 return log_oom();
263
264                         break;
265
266                 case ARG_NETWORK_VETH:
267                         arg_network_veth = true;
268                         arg_private_network = true;
269                         break;
270
271                 case ARG_NETWORK_INTERFACE:
272                         if (strv_push(&arg_network_interfaces, optarg) < 0)
273                                 return log_oom();
274
275                         /* fall through */
276
277                 case ARG_PRIVATE_NETWORK:
278                         arg_private_network = true;
279                         break;
280
281                 case 'b':
282                         arg_boot = true;
283                         break;
284
285                 case ARG_UUID:
286                         r = sd_id128_from_string(optarg, &arg_uuid);
287                         if (r < 0) {
288                                 log_error("Invalid UUID: %s", optarg);
289                                 return r;
290                         }
291                         break;
292
293                 case 'S':
294                         arg_slice = strdup(optarg);
295                         if (!arg_slice)
296                                 return log_oom();
297
298                         break;
299
300                 case 'M':
301                         if (isempty(optarg)) {
302                                 free(arg_machine);
303                                 arg_machine = NULL;
304                         } else {
305
306                                 if (!hostname_is_valid(optarg)) {
307                                         log_error("Invalid machine name: %s", optarg);
308                                         return -EINVAL;
309                                 }
310
311                                 free(arg_machine);
312                                 arg_machine = strdup(optarg);
313                                 if (!arg_machine)
314                                         return log_oom();
315
316                                 break;
317                         }
318
319                 case 'Z':
320                         arg_selinux_context = optarg;
321                         break;
322
323                 case 'L':
324                         arg_selinux_apifs_context = optarg;
325                         break;
326
327                 case ARG_READ_ONLY:
328                         arg_read_only = true;
329                         break;
330
331                 case ARG_CAPABILITY:
332                 case ARG_DROP_CAPABILITY: {
333                         char *state, *word;
334                         size_t length;
335
336                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
337                                 _cleanup_free_ char *t;
338                                 cap_value_t cap;
339
340                                 t = strndup(word, length);
341                                 if (!t)
342                                         return log_oom();
343
344                                 if (streq(t, "all")) {
345                                         if (c == ARG_CAPABILITY)
346                                                 plus = (uint64_t) -1;
347                                         else
348                                                 minus = (uint64_t) -1;
349                                 } else {
350                                         if (cap_from_name(t, &cap) < 0) {
351                                                 log_error("Failed to parse capability %s.", t);
352                                                 return -EINVAL;
353                                         }
354
355                                         if (c == ARG_CAPABILITY)
356                                                 plus |= 1ULL << (uint64_t) cap;
357                                         else
358                                                 minus |= 1ULL << (uint64_t) cap;
359                                 }
360                         }
361
362                         break;
363                 }
364
365                 case 'j':
366                         arg_link_journal = LINK_GUEST;
367                         break;
368
369                 case ARG_LINK_JOURNAL:
370                         if (streq(optarg, "auto"))
371                                 arg_link_journal = LINK_AUTO;
372                         else if (streq(optarg, "no"))
373                                 arg_link_journal = LINK_NO;
374                         else if (streq(optarg, "guest"))
375                                 arg_link_journal = LINK_GUEST;
376                         else if (streq(optarg, "host"))
377                                 arg_link_journal = LINK_HOST;
378                         else {
379                                 log_error("Failed to parse link journal mode %s", optarg);
380                                 return -EINVAL;
381                         }
382
383                         break;
384
385                 case ARG_BIND:
386                 case ARG_BIND_RO: {
387                         _cleanup_free_ char *a = NULL, *b = NULL;
388                         char *e;
389                         char ***x;
390
391                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
392
393                         e = strchr(optarg, ':');
394                         if (e) {
395                                 a = strndup(optarg, e - optarg);
396                                 b = strdup(e + 1);
397                         } else {
398                                 a = strdup(optarg);
399                                 b = strdup(optarg);
400                         }
401
402                         if (!a || !b)
403                                 return log_oom();
404
405                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
406                                 log_error("Invalid bind mount specification: %s", optarg);
407                                 return -EINVAL;
408                         }
409
410                         r = strv_extend(x, a);
411                         if (r < 0)
412                                 return log_oom();
413
414                         r = strv_extend(x, b);
415                         if (r < 0)
416                                 return log_oom();
417
418                         break;
419                 }
420
421                 case ARG_SETENV: {
422                         char **n;
423
424                         if (!env_assignment_is_valid(optarg)) {
425                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
426                                 return -EINVAL;
427                         }
428
429                         n = strv_env_set(arg_setenv, optarg);
430                         if (!n)
431                                 return log_oom();
432
433                         strv_free(arg_setenv);
434                         arg_setenv = n;
435                         break;
436                 }
437
438                 case 'q':
439                         arg_quiet = true;
440                         break;
441
442                 case ARG_SHARE_SYSTEM:
443                         arg_share_system = true;
444                         break;
445
446                 case ARG_REGISTER:
447                         r = parse_boolean(optarg);
448                         if (r < 0) {
449                                 log_error("Failed to parse --register= argument: %s", optarg);
450                                 return r;
451                         }
452
453                         arg_register = r;
454                         break;
455
456                 case ARG_KEEP_UNIT:
457                         arg_keep_unit = true;
458                         break;
459
460                 case '?':
461                         return -EINVAL;
462
463                 default:
464                         assert_not_reached("Unhandled option");
465                 }
466         }
467
468         if (arg_share_system)
469                 arg_register = false;
470
471         if (arg_boot && arg_share_system) {
472                 log_error("--boot and --share-system may not be combined.");
473                 return -EINVAL;
474         }
475
476         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
477                 log_error("--keep-unit may not be used when invoked from a user session.");
478                 return -EINVAL;
479         }
480
481         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
482
483         return 1;
484 }
485
486 static int mount_all(const char *dest) {
487
488         typedef struct MountPoint {
489                 const char *what;
490                 const char *where;
491                 const char *type;
492                 const char *options;
493                 unsigned long flags;
494                 bool fatal;
495         } MountPoint;
496
497         static const MountPoint mount_table[] = {
498                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
499                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
500                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
501                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
502                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
503                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
504                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
505                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
506 #ifdef HAVE_SELINUX
507                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
508                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
509 #endif
510         };
511
512         unsigned k;
513         int r = 0;
514
515         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
516                 _cleanup_free_ char *where = NULL;
517 #ifdef HAVE_SELINUX
518                 _cleanup_free_ char *options = NULL;
519 #endif
520                 const char *o;
521                 int t;
522
523                 where = strjoin(dest, "/", mount_table[k].where, NULL);
524                 if (!where)
525                         return log_oom();
526
527                 t = path_is_mount_point(where, true);
528                 if (t < 0) {
529                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
530
531                         if (r == 0)
532                                 r = t;
533
534                         continue;
535                 }
536
537                 /* Skip this entry if it is not a remount. */
538                 if (mount_table[k].what && t > 0)
539                         continue;
540
541                 mkdir_p(where, 0755);
542
543 #ifdef HAVE_SELINUX
544                 if (arg_selinux_apifs_context &&
545                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
546                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
547                         if (!options)
548                                 return log_oom();
549
550                         o = options;
551                 } else
552 #endif
553                         o = mount_table[k].options;
554
555
556                 if (mount(mount_table[k].what,
557                           where,
558                           mount_table[k].type,
559                           mount_table[k].flags,
560                           o) < 0 &&
561                     mount_table[k].fatal) {
562
563                         log_error("mount(%s) failed: %m", where);
564
565                         if (r == 0)
566                                 r = -errno;
567                 }
568         }
569
570         return r;
571 }
572
573 static int mount_binds(const char *dest, char **l, unsigned long flags) {
574         char **x, **y;
575
576         STRV_FOREACH_PAIR(x, y, l) {
577                 char *where;
578                 struct stat source_st, dest_st;
579                 int r;
580
581                 if (stat(*x, &source_st) < 0) {
582                         log_error("failed to stat %s: %m", *x);
583                         return -errno;
584                 }
585
586                 where = strappenda(dest, *y);
587                 r = stat(where, &dest_st);
588                 if (r == 0) {
589                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
590                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
591                                                 *x, where);
592                                 return -EINVAL;
593                         }
594                 } else if (errno == ENOENT) {
595                         r = mkdir_parents_label(where, 0755);
596                         if (r < 0) {
597                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
598                                 return r;
599                         }
600                 } else {
601                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
602                         return -errno;
603                 }
604                 /* Create the mount point, but be conservative -- refuse to create block
605                 * and char devices. */
606                 if (S_ISDIR(source_st.st_mode))
607                         mkdir_label(where, 0755);
608                 else if (S_ISFIFO(source_st.st_mode))
609                         mkfifo(where, 0644);
610                 else if (S_ISSOCK(source_st.st_mode))
611                         mknod(where, 0644 | S_IFSOCK, 0);
612                 else if (S_ISREG(source_st.st_mode))
613                         touch(where);
614                 else {
615                         log_error("Refusing to create mountpoint for file: %s", *x);
616                         return -ENOTSUP;
617                 }
618
619                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
620                         log_error("mount(%s) failed: %m", where);
621                         return -errno;
622                 }
623
624                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
625                         log_error("mount(%s) failed: %m", where);
626                         return -errno;
627                 }
628         }
629
630         return 0;
631 }
632
633 static int setup_timezone(const char *dest) {
634         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
635         char *z, *y;
636         int r;
637
638         assert(dest);
639
640         /* Fix the timezone, if possible */
641         r = readlink_malloc("/etc/localtime", &p);
642         if (r < 0) {
643                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
644                 return 0;
645         }
646
647         z = path_startswith(p, "../usr/share/zoneinfo/");
648         if (!z)
649                 z = path_startswith(p, "/usr/share/zoneinfo/");
650         if (!z) {
651                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
652                 return 0;
653         }
654
655         where = strappend(dest, "/etc/localtime");
656         if (!where)
657                 return log_oom();
658
659         r = readlink_malloc(where, &q);
660         if (r >= 0) {
661                 y = path_startswith(q, "../usr/share/zoneinfo/");
662                 if (!y)
663                         y = path_startswith(q, "/usr/share/zoneinfo/");
664
665
666                 /* Already pointing to the right place? Then do nothing .. */
667                 if (y && streq(y, z))
668                         return 0;
669         }
670
671         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
672         if (!check)
673                 return log_oom();
674
675         if (access(check, F_OK) < 0) {
676                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
677                 return 0;
678         }
679
680         what = strappend("../usr/share/zoneinfo/", z);
681         if (!what)
682                 return log_oom();
683
684         unlink(where);
685         if (symlink(what, where) < 0) {
686                 log_error("Failed to correct timezone of container: %m");
687                 return 0;
688         }
689
690         return 0;
691 }
692
693 static int setup_resolv_conf(const char *dest) {
694         char _cleanup_free_ *where = NULL;
695
696         assert(dest);
697
698         if (arg_private_network)
699                 return 0;
700
701         /* Fix resolv.conf, if possible */
702         where = strappend(dest, "/etc/resolv.conf");
703         if (!where)
704                 return log_oom();
705
706         /* We don't really care for the results of this really. If it
707          * fails, it fails, but meh... */
708         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
709
710         return 0;
711 }
712
713 static int setup_boot_id(const char *dest) {
714         _cleanup_free_ char *from = NULL, *to = NULL;
715         sd_id128_t rnd;
716         char as_uuid[37];
717         int r;
718
719         assert(dest);
720
721         if (arg_share_system)
722                 return 0;
723
724         /* Generate a new randomized boot ID, so that each boot-up of
725          * the container gets a new one */
726
727         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
728         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
729         if (!from || !to)
730                 return log_oom();
731
732         r = sd_id128_randomize(&rnd);
733         if (r < 0) {
734                 log_error("Failed to generate random boot id: %s", strerror(-r));
735                 return r;
736         }
737
738         snprintf(as_uuid, sizeof(as_uuid),
739                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
740                  SD_ID128_FORMAT_VAL(rnd));
741         char_array_0(as_uuid);
742
743         r = write_string_file(from, as_uuid);
744         if (r < 0) {
745                 log_error("Failed to write boot id: %s", strerror(-r));
746                 return r;
747         }
748
749         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
750                 log_error("Failed to bind mount boot id: %m");
751                 r = -errno;
752         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
753                 log_warning("Failed to make boot id read-only: %m");
754
755         unlink(from);
756         return r;
757 }
758
759 static int copy_devnodes(const char *dest) {
760
761         static const char devnodes[] =
762                 "null\0"
763                 "zero\0"
764                 "full\0"
765                 "random\0"
766                 "urandom\0"
767                 "tty\0";
768
769         const char *d;
770         int r = 0;
771         _cleanup_umask_ mode_t u;
772
773         assert(dest);
774
775         u = umask(0000);
776
777         NULSTR_FOREACH(d, devnodes) {
778                 _cleanup_free_ char *from = NULL, *to = NULL;
779                 struct stat st;
780
781                 from = strappend("/dev/", d);
782                 to = strjoin(dest, "/dev/", d, NULL);
783                 if (!from || !to)
784                         return log_oom();
785
786                 if (stat(from, &st) < 0) {
787
788                         if (errno != ENOENT) {
789                                 log_error("Failed to stat %s: %m", from);
790                                 return -errno;
791                         }
792
793                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
794
795                         log_error("%s is not a char or block device, cannot copy", from);
796                         return -EIO;
797
798                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
799
800                         log_error("mknod(%s) failed: %m", dest);
801                         return  -errno;
802                 }
803         }
804
805         return r;
806 }
807
808 static int setup_ptmx(const char *dest) {
809         _cleanup_free_ char *p = NULL;
810
811         p = strappend(dest, "/dev/ptmx");
812         if (!p)
813                 return log_oom();
814
815         if (symlink("pts/ptmx", p) < 0) {
816                 log_error("Failed to create /dev/ptmx symlink: %m");
817                 return -errno;
818         }
819
820         return 0;
821 }
822
823 static int setup_dev_console(const char *dest, const char *console) {
824         struct stat st;
825         _cleanup_free_ char *to = NULL;
826         int r;
827         _cleanup_umask_ mode_t u;
828
829         assert(dest);
830         assert(console);
831
832         u = umask(0000);
833
834         if (stat(console, &st) < 0) {
835                 log_error("Failed to stat %s: %m", console);
836                 return -errno;
837
838         } else if (!S_ISCHR(st.st_mode)) {
839                 log_error("/dev/console is not a char device");
840                 return -EIO;
841         }
842
843         r = chmod_and_chown(console, 0600, 0, 0);
844         if (r < 0) {
845                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
846                 return r;
847         }
848
849         if (asprintf(&to, "%s/dev/console", dest) < 0)
850                 return log_oom();
851
852         /* We need to bind mount the right tty to /dev/console since
853          * ptys can only exist on pts file systems. To have something
854          * to bind mount things on we create a device node first, that
855          * has the right major/minor (note that the major minor
856          * doesn't actually matter here, since we mount it over
857          * anyway). */
858
859         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
860                 log_error("mknod() for /dev/console failed: %m");
861                 return -errno;
862         }
863
864         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
865                 log_error("Bind mount for /dev/console failed: %m");
866                 return -errno;
867         }
868
869         return 0;
870 }
871
872 static int setup_kmsg(const char *dest, int kmsg_socket) {
873         _cleanup_free_ char *from = NULL, *to = NULL;
874         int r, fd, k;
875         _cleanup_umask_ mode_t u;
876         union {
877                 struct cmsghdr cmsghdr;
878                 uint8_t buf[CMSG_SPACE(sizeof(int))];
879         } control = {};
880         struct msghdr mh = {
881                 .msg_control = &control,
882                 .msg_controllen = sizeof(control),
883         };
884         struct cmsghdr *cmsg;
885
886         assert(dest);
887         assert(kmsg_socket >= 0);
888
889         u = umask(0000);
890
891         /* We create the kmsg FIFO as /dev/kmsg, but immediately
892          * delete it after bind mounting it to /proc/kmsg. While FIFOs
893          * on the reading side behave very similar to /proc/kmsg,
894          * their writing side behaves differently from /dev/kmsg in
895          * that writing blocks when nothing is reading. In order to
896          * avoid any problems with containers deadlocking due to this
897          * we simply make /dev/kmsg unavailable to the container. */
898         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
899             asprintf(&to, "%s/proc/kmsg", dest) < 0)
900                 return log_oom();
901
902         if (mkfifo(from, 0600) < 0) {
903                 log_error("mkfifo() for /dev/kmsg failed: %m");
904                 return -errno;
905         }
906
907         r = chmod_and_chown(from, 0600, 0, 0);
908         if (r < 0) {
909                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
910                 return r;
911         }
912
913         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
914                 log_error("Bind mount for /proc/kmsg failed: %m");
915                 return -errno;
916         }
917
918         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
919         if (fd < 0) {
920                 log_error("Failed to open fifo: %m");
921                 return -errno;
922         }
923
924         cmsg = CMSG_FIRSTHDR(&mh);
925         cmsg->cmsg_level = SOL_SOCKET;
926         cmsg->cmsg_type = SCM_RIGHTS;
927         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
928         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
929
930         mh.msg_controllen = cmsg->cmsg_len;
931
932         /* Store away the fd in the socket, so that it stays open as
933          * long as we run the child */
934         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
935         close_nointr_nofail(fd);
936
937         if (k < 0) {
938                 log_error("Failed to send FIFO fd: %m");
939                 return -errno;
940         }
941
942         /* And now make the FIFO unavailable as /dev/kmsg... */
943         unlink(from);
944         return 0;
945 }
946
947 static int setup_hostname(void) {
948
949         if (arg_share_system)
950                 return 0;
951
952         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
953                 return -errno;
954
955         return 0;
956 }
957
958 static int setup_journal(const char *directory) {
959         sd_id128_t machine_id, this_id;
960         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
961         char *id;
962         int r;
963
964         p = strappend(directory, "/etc/machine-id");
965         if (!p)
966                 return log_oom();
967
968         r = read_one_line_file(p, &b);
969         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
970                 return 0;
971         else if (r < 0) {
972                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
973                 return r;
974         }
975
976         id = strstrip(b);
977         if (isempty(id) && arg_link_journal == LINK_AUTO)
978                 return 0;
979
980         /* Verify validity */
981         r = sd_id128_from_string(id, &machine_id);
982         if (r < 0) {
983                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
984                 return r;
985         }
986
987         r = sd_id128_get_machine(&this_id);
988         if (r < 0) {
989                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
990                 return r;
991         }
992
993         if (sd_id128_equal(machine_id, this_id)) {
994                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
995                          "Host and machine ids are equal (%s): refusing to link journals", id);
996                 if (arg_link_journal == LINK_AUTO)
997                         return 0;
998                 return
999                         -EEXIST;
1000         }
1001
1002         if (arg_link_journal == LINK_NO)
1003                 return 0;
1004
1005         free(p);
1006         p = strappend("/var/log/journal/", id);
1007         q = strjoin(directory, "/var/log/journal/", id, NULL);
1008         if (!p || !q)
1009                 return log_oom();
1010
1011         if (path_is_mount_point(p, false) > 0) {
1012                 if (arg_link_journal != LINK_AUTO) {
1013                         log_error("%s: already a mount point, refusing to use for journal", p);
1014                         return -EEXIST;
1015                 }
1016
1017                 return 0;
1018         }
1019
1020         if (path_is_mount_point(q, false) > 0) {
1021                 if (arg_link_journal != LINK_AUTO) {
1022                         log_error("%s: already a mount point, refusing to use for journal", q);
1023                         return -EEXIST;
1024                 }
1025
1026                 return 0;
1027         }
1028
1029         r = readlink_and_make_absolute(p, &d);
1030         if (r >= 0) {
1031                 if ((arg_link_journal == LINK_GUEST ||
1032                      arg_link_journal == LINK_AUTO) &&
1033                     path_equal(d, q)) {
1034
1035                         r = mkdir_p(q, 0755);
1036                         if (r < 0)
1037                                 log_warning("failed to create directory %s: %m", q);
1038                         return 0;
1039                 }
1040
1041                 if (unlink(p) < 0) {
1042                         log_error("Failed to remove symlink %s: %m", p);
1043                         return -errno;
1044                 }
1045         } else if (r == -EINVAL) {
1046
1047                 if (arg_link_journal == LINK_GUEST &&
1048                     rmdir(p) < 0) {
1049
1050                         if (errno == ENOTDIR) {
1051                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1052                                 return r;
1053                         } else {
1054                                 log_error("Failed to remove %s: %m", p);
1055                                 return -errno;
1056                         }
1057                 }
1058         } else if (r != -ENOENT) {
1059                 log_error("readlink(%s) failed: %m", p);
1060                 return r;
1061         }
1062
1063         if (arg_link_journal == LINK_GUEST) {
1064
1065                 if (symlink(q, p) < 0) {
1066                         log_error("Failed to symlink %s to %s: %m", q, p);
1067                         return -errno;
1068                 }
1069
1070                 r = mkdir_p(q, 0755);
1071                 if (r < 0)
1072                         log_warning("failed to create directory %s: %m", q);
1073                 return 0;
1074         }
1075
1076         if (arg_link_journal == LINK_HOST) {
1077                 r = mkdir_p(p, 0755);
1078                 if (r < 0) {
1079                         log_error("Failed to create %s: %m", p);
1080                         return r;
1081                 }
1082
1083         } else if (access(p, F_OK) < 0)
1084                 return 0;
1085
1086         if (dir_is_empty(q) == 0) {
1087                 log_error("%s not empty.", q);
1088                 return -ENOTEMPTY;
1089         }
1090
1091         r = mkdir_p(q, 0755);
1092         if (r < 0) {
1093                 log_error("Failed to create %s: %m", q);
1094                 return r;
1095         }
1096
1097         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1098                 log_error("Failed to bind mount journal from host into guest: %m");
1099                 return -errno;
1100         }
1101
1102         return 0;
1103 }
1104
1105 static int setup_kdbus(const char *dest, const char *path) {
1106         const char *p;
1107
1108         if (!path)
1109                 return 0;
1110
1111         p = strappenda(dest, "/dev/kdbus");
1112         if (mkdir(p, 0755) < 0) {
1113                 log_error("Failed to create kdbus path: %m");
1114                 return  -errno;
1115         }
1116
1117         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1118                 log_error("Failed to mount kdbus domain path: %m");
1119                 return -errno;
1120         }
1121
1122         return 0;
1123 }
1124
1125 static int drop_capabilities(void) {
1126         return capability_bounding_set_drop(~arg_retain, false);
1127 }
1128
1129 static int register_machine(pid_t pid) {
1130         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1131         _cleanup_bus_unref_ sd_bus *bus = NULL;
1132         int r;
1133
1134         if (!arg_register)
1135                 return 0;
1136
1137         r = sd_bus_default_system(&bus);
1138         if (r < 0) {
1139                 log_error("Failed to open system bus: %s", strerror(-r));
1140                 return r;
1141         }
1142
1143         if (arg_keep_unit) {
1144                 r = sd_bus_call_method(
1145                                 bus,
1146                                 "org.freedesktop.machine1",
1147                                 "/org/freedesktop/machine1",
1148                                 "org.freedesktop.machine1.Manager",
1149                                 "RegisterMachine",
1150                                 &error,
1151                                 NULL,
1152                                 "sayssus",
1153                                 arg_machine,
1154                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1155                                 "nspawn",
1156                                 "container",
1157                                 (uint32_t) pid,
1158                                 strempty(arg_directory));
1159         } else {
1160                 r = sd_bus_call_method(
1161                                 bus,
1162                                 "org.freedesktop.machine1",
1163                                 "/org/freedesktop/machine1",
1164                                 "org.freedesktop.machine1.Manager",
1165                                 "CreateMachine",
1166                                 &error,
1167                                 NULL,
1168                                 "sayssusa(sv)",
1169                                 arg_machine,
1170                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1171                                 "nspawn",
1172                                 "container",
1173                                 (uint32_t) pid,
1174                                 strempty(arg_directory),
1175                                 !isempty(arg_slice), "Slice", "s", arg_slice);
1176         }
1177
1178         if (r < 0) {
1179                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1180                 return r;
1181         }
1182
1183         return 0;
1184 }
1185
1186 static int terminate_machine(pid_t pid) {
1187         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1188         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1189         _cleanup_bus_unref_ sd_bus *bus = NULL;
1190         const char *path;
1191         int r;
1192
1193         if (!arg_register)
1194                 return 0;
1195
1196         r = sd_bus_default_system(&bus);
1197         if (r < 0) {
1198                 log_error("Failed to open system bus: %s", strerror(-r));
1199                 return r;
1200         }
1201
1202         r = sd_bus_call_method(
1203                         bus,
1204                         "org.freedesktop.machine1",
1205                         "/org/freedesktop/machine1",
1206                         "org.freedesktop.machine1.Manager",
1207                         "GetMachineByPID",
1208                         &error,
1209                         &reply,
1210                         "u",
1211                         (uint32_t) pid);
1212         if (r < 0) {
1213                 /* Note that the machine might already have been
1214                  * cleaned up automatically, hence don't consider it a
1215                  * failure if we cannot get the machine object. */
1216                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1217                 return 0;
1218         }
1219
1220         r = sd_bus_message_read(reply, "o", &path);
1221         if (r < 0)
1222                 return bus_log_parse_error(r);
1223
1224         r = sd_bus_call_method(
1225                         bus,
1226                         "org.freedesktop.machine1",
1227                         path,
1228                         "org.freedesktop.machine1.Machine",
1229                         "Terminate",
1230                         &error,
1231                         NULL,
1232                         NULL);
1233         if (r < 0) {
1234                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1235                 return 0;
1236         }
1237
1238         return 0;
1239 }
1240
1241 static int reset_audit_loginuid(void) {
1242         _cleanup_free_ char *p = NULL;
1243         int r;
1244
1245         if (arg_share_system)
1246                 return 0;
1247
1248         r = read_one_line_file("/proc/self/loginuid", &p);
1249         if (r == -EEXIST)
1250                 return 0;
1251         if (r < 0) {
1252                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1253                 return r;
1254         }
1255
1256         /* Already reset? */
1257         if (streq(p, "4294967295"))
1258                 return 0;
1259
1260         r = write_string_file("/proc/self/loginuid", "4294967295");
1261         if (r < 0) {
1262                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1263                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1264                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1265                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1266                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1267
1268                 sleep(5);
1269         }
1270
1271         return 0;
1272 }
1273
1274 static int setup_veth(int netns_fd) {
1275         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1276         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1277         char iface_name[IFNAMSIZ] = "ve-";
1278         int r;
1279
1280         if (!arg_private_network)
1281                 return 0;
1282
1283         if (!arg_network_veth)
1284                 return 0;
1285
1286         strncpy(iface_name+3, arg_machine, sizeof(iface_name) - 3);
1287
1288         r = sd_rtnl_open(0, &rtnl);
1289         if (r < 0) {
1290                 log_error("Failed to connect to netlink: %s", strerror(-r));
1291                 return r;
1292         }
1293
1294         r = sd_rtnl_message_new_link(RTM_NEWLINK, 0, &m);
1295         if (r < 0) {
1296                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1297                 return r;
1298         }
1299
1300         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1301         if (r < 0) {
1302                 log_error("Failed to append netlink kind: %s", strerror(-r));
1303                 return r;
1304         }
1305
1306         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO, 0);
1307         if (r < 0) {
1308                 log_error("Failed to open netlink container: %s", strerror(-r));
1309                 return r;
1310         }
1311
1312         r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1313         if (r < 0) {
1314                 log_error("Failed to append netlink kind: %s", strerror(-r));
1315                 return r;
1316         }
1317
1318         r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA, 0);
1319         if (r < 0) {
1320                 log_error("Failed to open netlink container: %s", strerror(-r));
1321                 return r;
1322         }
1323
1324         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER, sizeof(struct ifinfomsg));
1325         if (r < 0) {
1326                 log_error("z Failed to open netlink container: %s", strerror(-r));
1327                 return r;
1328         }
1329
1330         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1331         if (r < 0) {
1332                 log_error("Failed to append netlink kind: %s", strerror(-r));
1333                 return r;
1334         }
1335
1336         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_FD, netns_fd);
1337         if (r < 0) {
1338                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1339                 return r;
1340         }
1341
1342         r = sd_rtnl_message_close_container(m);
1343         if (r < 0) {
1344                 log_error("Failed to close netlink container: %s", strerror(-r));
1345                 return r;
1346         }
1347
1348         r = sd_rtnl_message_close_container(m);
1349         if (r < 0) {
1350                 log_error("Failed to close netlink container: %s", strerror(-r));
1351                 return r;
1352         }
1353
1354         r = sd_rtnl_message_close_container(m);
1355         if (r < 0) {
1356                 log_error("Failed to close netlink container: %s", strerror(-r));
1357                 return r;
1358         }
1359
1360         r = sd_rtnl_call(rtnl, m, 0, NULL);
1361         if (r < 0) {
1362                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1363                 return r;
1364         }
1365
1366         return 0;
1367 }
1368
1369 static int move_network_interfaces(pid_t pid) {
1370         _cleanup_udev_unref_ struct udev *udev = NULL;
1371         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1372         char **i;
1373         int r;
1374
1375         if (!arg_private_network)
1376                 return 0;
1377
1378         if (strv_isempty(arg_network_interfaces))
1379                 return 0;
1380
1381         r = sd_rtnl_open(0, &rtnl);
1382         if (r < 0) {
1383                 log_error("Failed to connect to netlink: %s", strerror(-r));
1384                 return r;
1385         }
1386
1387         udev = udev_new();
1388         if (!udev) {
1389                 log_error("Failed to connect to udev.");
1390                 return -ENOMEM;
1391         }
1392
1393         STRV_FOREACH(i, arg_network_interfaces) {
1394                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1395                 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1396                 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1397                 int ifi;
1398
1399                 ifi = (int) if_nametoindex(*i);
1400                 if (ifi <= 0) {
1401                         log_error("Failed to resolve interface %s: %m", *i);
1402                         return -errno;
1403                 }
1404
1405                 sprintf(ifi_str, "n%i", ifi);
1406                 d = udev_device_new_from_device_id(udev, ifi_str);
1407                 if (!d) {
1408                         log_error("Failed to get udev device for interface %s: %m", *i);
1409                         return -errno;
1410                 }
1411
1412                 if (udev_device_get_is_initialized(d) <= 0) {
1413                         log_error("Network interface %s is not initialized yet.", *i);
1414                         return -EBUSY;
1415                 }
1416
1417                 r = sd_rtnl_message_new_link(RTM_NEWLINK, ifi, &m);
1418                 if (r < 0) {
1419                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1420                         return r;
1421                 }
1422
1423                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1424                 if (r < 0) {
1425                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1426                         return r;
1427                 }
1428
1429                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1430                 if (r < 0) {
1431                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1432                         return r;
1433                 }
1434         }
1435
1436         return 0;
1437 }
1438
/* Installs a seccomp filter that makes socket(AF_NETLINK, *,
 * NETLINK_AUDIT) fail with EAFNOSUPPORT while allowing everything
 * else. Without HAVE_SECCOMP this is a no-op returning 0.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx filter;
        int ret;

        /*
           Audit does not work inside containers, and much of the
           audit userspace hookup fails when run in one. We do not
           bother and simply prevent audit sockets from being created.

           EAFNOSUPPORT is what audit userspace takes as the sign
           that audit is disabled in the kernel.
         */

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        ret = seccomp_rule_add_exact(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (ret < 0)
                log_error("Failed to add audit seccomp rule: %s", strerror(-ret));
        else {
                ret = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
                if (ret < 0)
                        log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-ret));
                else {
                        ret = seccomp_load(filter);
                        if (ret < 0)
                                log_error("Failed to install seccomp audit filter: %s", strerror(-ret));
                }
        }

        /* The filter context is released on success and failure alike */
        seccomp_release(filter);
        return ret;
#else
        return 0;
#endif

}
1489
1490 int main(int argc, char *argv[]) {
1491
1492         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1, netns_fd = -1;
1493         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1494         _cleanup_free_ char *kdbus_domain = NULL;
1495         _cleanup_fdset_free_ FDSet *fds = NULL;
1496         const char *console = NULL;
1497         int r = EXIT_FAILURE, k;
1498         int n_fd_passed;
1499         pid_t pid = 0;
1500         sigset_t mask;
1501
1502         log_parse_environment();
1503         log_open();
1504
1505         k = parse_argv(argc, argv);
1506         if (k < 0)
1507                 goto finish;
1508         else if (k == 0) {
1509                 r = EXIT_SUCCESS;
1510                 goto finish;
1511         }
1512
1513         if (arg_directory) {
1514                 char *p;
1515
1516                 p = path_make_absolute_cwd(arg_directory);
1517                 free(arg_directory);
1518                 arg_directory = p;
1519         } else
1520                 arg_directory = get_current_dir_name();
1521
1522         if (!arg_directory) {
1523                 log_error("Failed to determine path, please use -D.");
1524                 goto finish;
1525         }
1526
1527         path_kill_slashes(arg_directory);
1528
1529         if (!arg_machine) {
1530                 arg_machine = strdup(basename(arg_directory));
1531                 if (!arg_machine) {
1532                         log_oom();
1533                         goto finish;
1534                 }
1535
1536                 hostname_cleanup(arg_machine, false);
1537                 if (isempty(arg_machine)) {
1538                         log_error("Failed to determine machine name automatically, please use -M.");
1539                         goto finish;
1540                 }
1541         }
1542
1543         if (geteuid() != 0) {
1544                 log_error("Need to be root.");
1545                 goto finish;
1546         }
1547
1548         if (sd_booted() <= 0) {
1549                 log_error("Not running on a systemd system.");
1550                 goto finish;
1551         }
1552
1553         if (path_equal(arg_directory, "/")) {
1554                 log_error("Spawning container on root directory not supported.");
1555                 goto finish;
1556         }
1557
1558         if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1559                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1560                 goto finish;
1561         }
1562
1563         log_close();
1564         n_fd_passed = sd_listen_fds(false);
1565         if (n_fd_passed > 0) {
1566                 k = fdset_new_listen_fds(&fds, false);
1567                 if (k < 0) {
1568                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1569                         goto finish;
1570                 }
1571         }
1572         fdset_close_others(fds);
1573         log_open();
1574
1575         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1576         if (master < 0) {
1577                 log_error("Failed to acquire pseudo tty: %m");
1578                 goto finish;
1579         }
1580
1581         console = ptsname(master);
1582         if (!console) {
1583                 log_error("Failed to determine tty name: %m");
1584                 goto finish;
1585         }
1586
1587         if (!arg_quiet)
1588                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1589
1590         if (unlockpt(master) < 0) {
1591                 log_error("Failed to unlock tty: %m");
1592                 goto finish;
1593         }
1594
1595         if (arg_network_veth) {
1596                 netns_fd = open("/proc/self/ns/net", O_RDWR|O_CLOEXEC);
1597                 if (netns_fd < 0) {
1598                         log_error("Failed to open network namespace fd: %m");
1599                         goto finish;
1600                 }
1601         }
1602
1603         if (access("/dev/kdbus/control", F_OK) >= 0) {
1604
1605                 if (arg_share_system) {
1606                         kdbus_domain = strdup("/dev/kdbus");
1607                         if (!kdbus_domain) {
1608                                 log_oom();
1609                                 goto finish;
1610                         }
1611                 } else {
1612                         const char *ns;
1613
1614                         ns = strappenda("machine-", arg_machine);
1615                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1616                         if (r < 0)
1617                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1618                         else
1619                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1620                 }
1621         }
1622
1623         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1624                 log_error("Failed to create kmsg socket pair: %m");
1625                 goto finish;
1626         }
1627
1628         sd_notify(0, "READY=1");
1629
1630         assert_se(sigemptyset(&mask) == 0);
1631         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1632         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1633
1634         for (;;) {
1635                 siginfo_t status;
1636
1637                 sync_fd = eventfd(0, EFD_CLOEXEC);
1638                 if (sync_fd < 0) {
1639                         log_error("Failed to create event fd: %m");
1640                         goto finish;
1641                 }
1642
1643                 pid = syscall(__NR_clone,
1644                               SIGCHLD|CLONE_NEWNS|
1645                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1646                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
1647                 if (pid < 0) {
1648                         if (errno == EINVAL)
1649                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1650                         else
1651                                 log_error("clone() failed: %m");
1652
1653                         goto finish;
1654                 }
1655
1656                 if (pid == 0) {
1657                         /* child */
1658                         const char *home = NULL;
1659                         uid_t uid = (uid_t) -1;
1660                         gid_t gid = (gid_t) -1;
1661                         unsigned n_env = 2;
1662                         const char *envp[] = {
1663                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1664                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1665                                 NULL, /* TERM */
1666                                 NULL, /* HOME */
1667                                 NULL, /* USER */
1668                                 NULL, /* LOGNAME */
1669                                 NULL, /* container_uuid */
1670                                 NULL, /* LISTEN_FDS */
1671                                 NULL, /* LISTEN_PID */
1672                                 NULL
1673                         };
1674                         char **env_use;
1675                         eventfd_t x;
1676
1677                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1678                         if (envp[n_env])
1679                                 n_env ++;
1680
1681                         close_nointr_nofail(master);
1682                         master = -1;
1683
1684                         close_nointr(STDIN_FILENO);
1685                         close_nointr(STDOUT_FILENO);
1686                         close_nointr(STDERR_FILENO);
1687
1688                         close_nointr_nofail(kmsg_socket_pair[0]);
1689                         kmsg_socket_pair[0] = -1;
1690
1691                         reset_all_signal_handlers();
1692
1693                         assert_se(sigemptyset(&mask) == 0);
1694                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1695
1696                         k = open_terminal(console, O_RDWR);
1697                         if (k != STDIN_FILENO) {
1698                                 if (k >= 0) {
1699                                         close_nointr_nofail(k);
1700                                         k = -EINVAL;
1701                                 }
1702
1703                                 log_error("Failed to open console: %s", strerror(-k));
1704                                 goto child_fail;
1705                         }
1706
1707                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1708                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1709                                 log_error("Failed to duplicate console: %m");
1710                                 goto child_fail;
1711                         }
1712
1713                         if (setsid() < 0) {
1714                                 log_error("setsid() failed: %m");
1715                                 goto child_fail;
1716                         }
1717
1718                         if (reset_audit_loginuid() < 0)
1719                                 goto child_fail;
1720
1721                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1722                                 log_error("PR_SET_PDEATHSIG failed: %m");
1723                                 goto child_fail;
1724                         }
1725
1726                         /* Mark everything as slave, so that we still
1727                          * receive mounts from the real root, but don't
1728                          * propagate mounts to the real root. */
1729                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1730                                 log_error("MS_SLAVE|MS_REC failed: %m");
1731                                 goto child_fail;
1732                         }
1733
1734                         /* Turn directory into bind mount */
1735                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1736                                 log_error("Failed to make bind mount.");
1737                                 goto child_fail;
1738                         }
1739
1740                         if (arg_read_only)
1741                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1742                                         log_error("Failed to make read-only.");
1743                                         goto child_fail;
1744                                 }
1745
1746                         if (mount_all(arg_directory) < 0)
1747                                 goto child_fail;
1748
1749                         if (copy_devnodes(arg_directory) < 0)
1750                                 goto child_fail;
1751
1752                         if (setup_ptmx(arg_directory) < 0)
1753                                 goto child_fail;
1754
1755                         dev_setup(arg_directory);
1756
1757                         if (setup_veth(netns_fd) < 0)
1758                                 goto child_fail;
1759
1760                         if (netns_fd >= 0) {
1761                                 close_nointr_nofail(netns_fd);
1762                                 netns_fd = -1;
1763                         }
1764
1765                         if (audit_still_doesnt_work_in_containers() < 0)
1766                                 goto child_fail;
1767
1768                         if (setup_dev_console(arg_directory, console) < 0)
1769                                 goto child_fail;
1770
1771                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1772                                 goto child_fail;
1773
1774                         close_nointr_nofail(kmsg_socket_pair[1]);
1775                         kmsg_socket_pair[1] = -1;
1776
1777                         if (setup_boot_id(arg_directory) < 0)
1778                                 goto child_fail;
1779
1780                         if (setup_timezone(arg_directory) < 0)
1781                                 goto child_fail;
1782
1783                         if (setup_resolv_conf(arg_directory) < 0)
1784                                 goto child_fail;
1785
1786                         if (setup_journal(arg_directory) < 0)
1787                                 goto child_fail;
1788
1789                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1790                                 goto child_fail;
1791
1792                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1793                                 goto child_fail;
1794
1795                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1796                                 goto child_fail;
1797
1798                         if (chdir(arg_directory) < 0) {
1799                                 log_error("chdir(%s) failed: %m", arg_directory);
1800                                 goto child_fail;
1801                         }
1802
1803                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1804                                 log_error("mount(MS_MOVE) failed: %m");
1805                                 goto child_fail;
1806                         }
1807
1808                         if (chroot(".") < 0) {
1809                                 log_error("chroot() failed: %m");
1810                                 goto child_fail;
1811                         }
1812
1813                         if (chdir("/") < 0) {
1814                                 log_error("chdir() failed: %m");
1815                                 goto child_fail;
1816                         }
1817
1818                         umask(0022);
1819
1820                         if (arg_private_network)
1821                                 loopback_setup();
1822
1823                         if (drop_capabilities() < 0) {
1824                                 log_error("drop_capabilities() failed: %m");
1825                                 goto child_fail;
1826                         }
1827
1828                         if (arg_user) {
1829
1830                                 /* Note that this resolves user names
1831                                  * inside the container, and hence
1832                                  * accesses the NSS modules from the
1833                                  * container and not the host. This is
1834                                  * a bit weird... */
1835
1836                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1837                                         log_error("get_user_creds() failed: %m");
1838                                         goto child_fail;
1839                                 }
1840
1841                                 if (mkdir_parents_label(home, 0775) < 0) {
1842                                         log_error("mkdir_parents_label() failed: %m");
1843                                         goto child_fail;
1844                                 }
1845
1846                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1847                                         log_error("mkdir_safe_label() failed: %m");
1848                                         goto child_fail;
1849                                 }
1850
1851                                 if (initgroups((const char*)arg_user, gid) < 0) {
1852                                         log_error("initgroups() failed: %m");
1853                                         goto child_fail;
1854                                 }
1855
1856                                 if (setresgid(gid, gid, gid) < 0) {
1857                                         log_error("setregid() failed: %m");
1858                                         goto child_fail;
1859                                 }
1860
1861                                 if (setresuid(uid, uid, uid) < 0) {
1862                                         log_error("setreuid() failed: %m");
1863                                         goto child_fail;
1864                                 }
1865                         } else {
1866                                 /* Reset everything fully to 0, just in case */
1867
1868                                 if (setgroups(0, NULL) < 0) {
1869                                         log_error("setgroups() failed: %m");
1870                                         goto child_fail;
1871                                 }
1872
1873                                 if (setresgid(0, 0, 0) < 0) {
1874                                         log_error("setregid() failed: %m");
1875                                         goto child_fail;
1876                                 }
1877
1878                                 if (setresuid(0, 0, 0) < 0) {
1879                                         log_error("setreuid() failed: %m");
1880                                         goto child_fail;
1881                                 }
1882                         }
1883
1884                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1885                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1886                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1887                                 log_oom();
1888                                 goto child_fail;
1889                         }
1890
1891                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1892                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1893                                         log_oom();
1894                                         goto child_fail;
1895                                 }
1896                         }
1897
1898                         if (fdset_size(fds) > 0) {
1899                                 k = fdset_cloexec(fds, false);
1900                                 if (k < 0) {
1901                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1902                                         goto child_fail;
1903                                 }
1904
1905                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1906                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1907                                         log_oom();
1908                                         goto child_fail;
1909                                 }
1910                         }
1911
1912                         setup_hostname();
1913
1914                         eventfd_read(sync_fd, &x);
1915                         close_nointr_nofail(sync_fd);
1916                         sync_fd = -1;
1917
1918                         if (!strv_isempty(arg_setenv)) {
1919                                 char **n;
1920
1921                                 n = strv_env_merge(2, envp, arg_setenv);
1922                                 if (!n) {
1923                                         log_oom();
1924                                         goto child_fail;
1925                                 }
1926
1927                                 env_use = n;
1928                         } else
1929                                 env_use = (char**) envp;
1930
1931 #ifdef HAVE_SELINUX
1932                         if (arg_selinux_context)
1933                                 if (setexeccon(arg_selinux_context) < 0)
1934                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1935 #endif
1936                         if (arg_boot) {
1937                                 char **a;
1938                                 size_t l;
1939
1940                                 /* Automatically search for the init system */
1941
1942                                 l = 1 + argc - optind;
1943                                 a = newa(char*, l + 1);
1944                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1945
1946                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1947                                 execve(a[0], a, env_use);
1948
1949                                 a[0] = (char*) "/lib/systemd/systemd";
1950                                 execve(a[0], a, env_use);
1951
1952                                 a[0] = (char*) "/sbin/init";
1953                                 execve(a[0], a, env_use);
1954                         } else if (argc > optind)
1955                                 execvpe(argv[optind], argv + optind, env_use);
1956                         else {
1957                                 chdir(home ? home : "/root");
1958                                 execle("/bin/bash", "-bash", NULL, env_use);
1959                         }
1960
1961                         log_error("execv() failed: %m");
1962
1963                 child_fail:
1964                         _exit(EXIT_FAILURE);
1965                 }
1966
1967                 fdset_free(fds);
1968                 fds = NULL;
1969
1970                 r = register_machine(pid);
1971                 if (r < 0)
1972                         goto finish;
1973
1974                 r = move_network_interfaces(pid);
1975                 if (r < 0)
1976                         goto finish;
1977
1978                 eventfd_write(sync_fd, 1);
1979                 close_nointr_nofail(sync_fd);
1980                 sync_fd = -1;
1981
1982                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1983                 if (k < 0) {
1984                         r = EXIT_FAILURE;
1985                         break;
1986                 }
1987
1988                 if (!arg_quiet)
1989                         putc('\n', stdout);
1990
1991                 /* Kill if it is not dead yet anyway */
1992                 terminate_machine(pid);
1993
1994                 /* Redundant, but better safe than sorry */
1995                 kill(pid, SIGKILL);
1996
1997                 k = wait_for_terminate(pid, &status);
1998                 pid = 0;
1999
2000                 if (k < 0) {
2001                         r = EXIT_FAILURE;
2002                         break;
2003                 }
2004
2005                 if (status.si_code == CLD_EXITED) {
2006                         r = status.si_status;
2007                         if (status.si_status != 0) {
2008                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
2009                                 break;
2010                         }
2011
2012                         if (!arg_quiet)
2013                                 log_debug("Container %s exited successfully.", arg_machine);
2014                         break;
2015                 } else if (status.si_code == CLD_KILLED &&
2016                            status.si_status == SIGINT) {
2017
2018                         if (!arg_quiet)
2019                                 log_info("Container %s has been shut down.", arg_machine);
2020                         r = 0;
2021                         break;
2022                 } else if (status.si_code == CLD_KILLED &&
2023                            status.si_status == SIGHUP) {
2024
2025                         if (!arg_quiet)
2026                                 log_info("Container %s is being rebooted.", arg_machine);
2027                         continue;
2028                 } else if (status.si_code == CLD_KILLED ||
2029                            status.si_code == CLD_DUMPED) {
2030
2031                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2032                         r = EXIT_FAILURE;
2033                         break;
2034                 } else {
2035                         log_error("Container %s failed due to unknown reason.", arg_machine);
2036                         r = EXIT_FAILURE;
2037                         break;
2038                 }
2039         }
2040
2041 finish:
2042         if (pid > 0)
2043                 kill(pid, SIGKILL);
2044
2045         free(arg_directory);
2046         free(arg_machine);
2047         free(arg_setenv);
2048         free(arg_network_interfaces);
2049
2050         return r;
2051 }