chiark / gitweb /
nspawn: check with udev before we take possession of an interface
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <linux/rtnetlink.h>
44 #include <sys/eventfd.h>
45 #include <net/if.h>
46
47 #ifdef HAVE_SELINUX
48 #include <selinux/selinux.h>
49 #endif
50
51 #include "sd-daemon.h"
52 #include "sd-bus.h"
53 #include "sd-id128.h"
54 #include "sd-rtnl.h"
55 #include "log.h"
56 #include "util.h"
57 #include "mkdir.h"
58 #include "macro.h"
59 #include "audit.h"
60 #include "missing.h"
61 #include "cgroup-util.h"
62 #include "strv.h"
63 #include "path-util.h"
64 #include "loopback-setup.h"
65 #include "dev-setup.h"
66 #include "fdset.h"
67 #include "build.h"
68 #include "fileio.h"
69 #include "bus-util.h"
70 #include "bus-error.h"
71 #include "ptyfwd.h"
72 #include "bus-kernel.h"
73 #include "env-util.h"
74 #include "def.h"
75 #include "rtnl-util.h"
76 #include "udev-util.h"
77
/* How the container's journal is linked up; chosen with --link-journal=
 * (or -j) in parse_argv() and stored in arg_link_journal. */
typedef enum LinkJournal {
        LINK_NO = 0,    /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; initial value of arg_link_journal */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, and what -j selects */
} LinkJournal;
84
85 static char *arg_directory = NULL;
86 static char *arg_user = NULL;
87 static sd_id128_t arg_uuid = {};
88 static char *arg_machine = NULL;
89 static char *arg_selinux_context = NULL;
90 static char *arg_selinux_apifs_context = NULL;
91 static const char *arg_slice = NULL;
92 static bool arg_private_network = false;
93 static bool arg_read_only = false;
94 static bool arg_boot = false;
95 static LinkJournal arg_link_journal = LINK_AUTO;
96 static uint64_t arg_retain =
97         (1ULL << CAP_CHOWN) |
98         (1ULL << CAP_DAC_OVERRIDE) |
99         (1ULL << CAP_DAC_READ_SEARCH) |
100         (1ULL << CAP_FOWNER) |
101         (1ULL << CAP_FSETID) |
102         (1ULL << CAP_IPC_OWNER) |
103         (1ULL << CAP_KILL) |
104         (1ULL << CAP_LEASE) |
105         (1ULL << CAP_LINUX_IMMUTABLE) |
106         (1ULL << CAP_NET_BIND_SERVICE) |
107         (1ULL << CAP_NET_BROADCAST) |
108         (1ULL << CAP_NET_RAW) |
109         (1ULL << CAP_SETGID) |
110         (1ULL << CAP_SETFCAP) |
111         (1ULL << CAP_SETPCAP) |
112         (1ULL << CAP_SETUID) |
113         (1ULL << CAP_SYS_ADMIN) |
114         (1ULL << CAP_SYS_CHROOT) |
115         (1ULL << CAP_SYS_NICE) |
116         (1ULL << CAP_SYS_PTRACE) |
117         (1ULL << CAP_SYS_TTY_CONFIG) |
118         (1ULL << CAP_SYS_RESOURCE) |
119         (1ULL << CAP_SYS_BOOT) |
120         (1ULL << CAP_AUDIT_WRITE) |
121         (1ULL << CAP_AUDIT_CONTROL) |
122         (1ULL << CAP_MKNOD);
123 static char **arg_bind = NULL;
124 static char **arg_bind_ro = NULL;
125 static char **arg_setenv = NULL;
126 static bool arg_quiet = false;
127 static bool arg_share_system = false;
128 static bool arg_register = true;
129 static bool arg_keep_unit = false;
130 static char **arg_network_interfaces = NULL;
131
/* Prints the usage summary for all supported options to stdout.
 *
 * Always returns 0; parse_argv() passes that value on to its caller when
 * -h/--help is given.
 *
 * The -j line states "guest" because parse_argv() maps -j to LINK_GUEST,
 * the same mode --link-journal=guest selects. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -D --directory=NAME       Root directory for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the container\n"
               "     --share-system         Share system namespaces with host\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "  -q --quiet                Do not show status information\n",
               program_invocation_short_name);

        return 0;
}
172
173 static int parse_argv(int argc, char *argv[]) {
174
175         enum {
176                 ARG_VERSION = 0x100,
177                 ARG_PRIVATE_NETWORK,
178                 ARG_UUID,
179                 ARG_READ_ONLY,
180                 ARG_CAPABILITY,
181                 ARG_DROP_CAPABILITY,
182                 ARG_LINK_JOURNAL,
183                 ARG_BIND,
184                 ARG_BIND_RO,
185                 ARG_SETENV,
186                 ARG_SHARE_SYSTEM,
187                 ARG_REGISTER,
188                 ARG_KEEP_UNIT,
189                 ARG_NETWORK_INTERFACE
190         };
191
192         static const struct option options[] = {
193                 { "help",                  no_argument,       NULL, 'h'                   },
194                 { "version",               no_argument,       NULL, ARG_VERSION           },
195                 { "directory",             required_argument, NULL, 'D'                   },
196                 { "user",                  required_argument, NULL, 'u'                   },
197                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
198                 { "boot",                  no_argument,       NULL, 'b'                   },
199                 { "uuid",                  required_argument, NULL, ARG_UUID              },
200                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
201                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
202                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
203                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
204                 { "bind",                  required_argument, NULL, ARG_BIND              },
205                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
206                 { "machine",               required_argument, NULL, 'M'                   },
207                 { "slice",                 required_argument, NULL, 'S'                   },
208                 { "setenv",                required_argument, NULL, ARG_SETENV            },
209                 { "selinux-context",       required_argument, NULL, 'Z'                   },
210                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
211                 { "quiet",                 no_argument,       NULL, 'q'                   },
212                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
213                 { "register",              required_argument, NULL, ARG_REGISTER          },
214                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
215                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
216                 {}
217         };
218
219         int c, r;
220         uint64_t plus = 0, minus = 0;
221
222         assert(argc >= 0);
223         assert(argv);
224
225         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
226
227                 switch (c) {
228
229                 case 'h':
230                         return help();
231
232                 case ARG_VERSION:
233                         puts(PACKAGE_STRING);
234                         puts(SYSTEMD_FEATURES);
235                         return 0;
236
237                 case 'D':
238                         free(arg_directory);
239                         arg_directory = canonicalize_file_name(optarg);
240                         if (!arg_directory) {
241                                 log_error("Invalid root directory: %m");
242                                 return -ENOMEM;
243                         }
244
245                         break;
246
247                 case 'u':
248                         free(arg_user);
249                         arg_user = strdup(optarg);
250                         if (!arg_user)
251                                 return log_oom();
252
253                         break;
254
255                 case ARG_NETWORK_INTERFACE:
256                         if (strv_push(&arg_network_interfaces, optarg) < 0)
257                                 return log_oom();
258
259                         /* fall through */
260
261                 case ARG_PRIVATE_NETWORK:
262                         arg_private_network = true;
263                         break;
264
265                 case 'b':
266                         arg_boot = true;
267                         break;
268
269                 case ARG_UUID:
270                         r = sd_id128_from_string(optarg, &arg_uuid);
271                         if (r < 0) {
272                                 log_error("Invalid UUID: %s", optarg);
273                                 return r;
274                         }
275                         break;
276
277                 case 'S':
278                         arg_slice = strdup(optarg);
279                         if (!arg_slice)
280                                 return log_oom();
281
282                         break;
283
284                 case 'M':
285                         if (isempty(optarg)) {
286                                 free(arg_machine);
287                                 arg_machine = NULL;
288                         } else {
289
290                                 if (!hostname_is_valid(optarg)) {
291                                         log_error("Invalid machine name: %s", optarg);
292                                         return -EINVAL;
293                                 }
294
295                                 free(arg_machine);
296                                 arg_machine = strdup(optarg);
297                                 if (!arg_machine)
298                                         return log_oom();
299
300                                 break;
301                         }
302
303                 case 'Z':
304                         arg_selinux_context = optarg;
305                         break;
306
307                 case 'L':
308                         arg_selinux_apifs_context = optarg;
309                         break;
310
311                 case ARG_READ_ONLY:
312                         arg_read_only = true;
313                         break;
314
315                 case ARG_CAPABILITY:
316                 case ARG_DROP_CAPABILITY: {
317                         char *state, *word;
318                         size_t length;
319
320                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
321                                 _cleanup_free_ char *t;
322                                 cap_value_t cap;
323
324                                 t = strndup(word, length);
325                                 if (!t)
326                                         return log_oom();
327
328                                 if (streq(t, "all")) {
329                                         if (c == ARG_CAPABILITY)
330                                                 plus = (uint64_t) -1;
331                                         else
332                                                 minus = (uint64_t) -1;
333                                 } else {
334                                         if (cap_from_name(t, &cap) < 0) {
335                                                 log_error("Failed to parse capability %s.", t);
336                                                 return -EINVAL;
337                                         }
338
339                                         if (c == ARG_CAPABILITY)
340                                                 plus |= 1ULL << (uint64_t) cap;
341                                         else
342                                                 minus |= 1ULL << (uint64_t) cap;
343                                 }
344                         }
345
346                         break;
347                 }
348
349                 case 'j':
350                         arg_link_journal = LINK_GUEST;
351                         break;
352
353                 case ARG_LINK_JOURNAL:
354                         if (streq(optarg, "auto"))
355                                 arg_link_journal = LINK_AUTO;
356                         else if (streq(optarg, "no"))
357                                 arg_link_journal = LINK_NO;
358                         else if (streq(optarg, "guest"))
359                                 arg_link_journal = LINK_GUEST;
360                         else if (streq(optarg, "host"))
361                                 arg_link_journal = LINK_HOST;
362                         else {
363                                 log_error("Failed to parse link journal mode %s", optarg);
364                                 return -EINVAL;
365                         }
366
367                         break;
368
369                 case ARG_BIND:
370                 case ARG_BIND_RO: {
371                         _cleanup_free_ char *a = NULL, *b = NULL;
372                         char *e;
373                         char ***x;
374
375                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
376
377                         e = strchr(optarg, ':');
378                         if (e) {
379                                 a = strndup(optarg, e - optarg);
380                                 b = strdup(e + 1);
381                         } else {
382                                 a = strdup(optarg);
383                                 b = strdup(optarg);
384                         }
385
386                         if (!a || !b)
387                                 return log_oom();
388
389                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
390                                 log_error("Invalid bind mount specification: %s", optarg);
391                                 return -EINVAL;
392                         }
393
394                         r = strv_extend(x, a);
395                         if (r < 0)
396                                 return log_oom();
397
398                         r = strv_extend(x, b);
399                         if (r < 0)
400                                 return log_oom();
401
402                         break;
403                 }
404
405                 case ARG_SETENV: {
406                         char **n;
407
408                         if (!env_assignment_is_valid(optarg)) {
409                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
410                                 return -EINVAL;
411                         }
412
413                         n = strv_env_set(arg_setenv, optarg);
414                         if (!n)
415                                 return log_oom();
416
417                         strv_free(arg_setenv);
418                         arg_setenv = n;
419                         break;
420                 }
421
422                 case 'q':
423                         arg_quiet = true;
424                         break;
425
426                 case ARG_SHARE_SYSTEM:
427                         arg_share_system = true;
428                         break;
429
430                 case ARG_REGISTER:
431                         r = parse_boolean(optarg);
432                         if (r < 0) {
433                                 log_error("Failed to parse --register= argument: %s", optarg);
434                                 return r;
435                         }
436
437                         arg_register = r;
438                         break;
439
440                 case ARG_KEEP_UNIT:
441                         arg_keep_unit = true;
442                         break;
443
444                 case '?':
445                         return -EINVAL;
446
447                 default:
448                         assert_not_reached("Unhandled option");
449                 }
450         }
451
452         if (arg_share_system)
453                 arg_register = false;
454
455         if (arg_boot && arg_share_system) {
456                 log_error("--boot and --share-system may not be combined.");
457                 return -EINVAL;
458         }
459
460         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
461                 log_error("--keep-unit may not be used when invoked from a user session.");
462                 return -EINVAL;
463         }
464
465         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
466
467         return 1;
468 }
469
470 static int mount_all(const char *dest) {
471
472         typedef struct MountPoint {
473                 const char *what;
474                 const char *where;
475                 const char *type;
476                 const char *options;
477                 unsigned long flags;
478                 bool fatal;
479         } MountPoint;
480
481         static const MountPoint mount_table[] = {
482                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
483                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
484                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
485                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
486                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
487                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
488                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
489                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
490 #ifdef HAVE_SELINUX
491                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
492                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
493 #endif
494         };
495
496         unsigned k;
497         int r = 0;
498
499         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
500                 _cleanup_free_ char *where = NULL;
501 #ifdef HAVE_SELINUX
502                 _cleanup_free_ char *options = NULL;
503 #endif
504                 const char *o;
505                 int t;
506
507                 where = strjoin(dest, "/", mount_table[k].where, NULL);
508                 if (!where)
509                         return log_oom();
510
511                 t = path_is_mount_point(where, true);
512                 if (t < 0) {
513                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
514
515                         if (r == 0)
516                                 r = t;
517
518                         continue;
519                 }
520
521                 /* Skip this entry if it is not a remount. */
522                 if (mount_table[k].what && t > 0)
523                         continue;
524
525                 mkdir_p(where, 0755);
526
527 #ifdef HAVE_SELINUX
528                 if (arg_selinux_apifs_context &&
529                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
530                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
531                         if (!options)
532                                 return log_oom();
533
534                         o = options;
535                 } else
536 #endif
537                         o = mount_table[k].options;
538
539
540                 if (mount(mount_table[k].what,
541                           where,
542                           mount_table[k].type,
543                           mount_table[k].flags,
544                           o) < 0 &&
545                     mount_table[k].fatal) {
546
547                         log_error("mount(%s) failed: %m", where);
548
549                         if (r == 0)
550                                 r = -errno;
551                 }
552         }
553
554         return r;
555 }
556
557 static int mount_binds(const char *dest, char **l, unsigned long flags) {
558         char **x, **y;
559
560         STRV_FOREACH_PAIR(x, y, l) {
561                 char *where;
562                 struct stat source_st, dest_st;
563                 int r;
564
565                 if (stat(*x, &source_st) < 0) {
566                         log_error("failed to stat %s: %m", *x);
567                         return -errno;
568                 }
569
570                 where = strappenda(dest, *y);
571                 r = stat(where, &dest_st);
572                 if (r == 0) {
573                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
574                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
575                                                 *x, where);
576                                 return -EINVAL;
577                         }
578                 } else if (errno == ENOENT) {
579                         r = mkdir_parents_label(where, 0755);
580                         if (r < 0) {
581                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
582                                 return r;
583                         }
584                 } else {
585                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
586                         return -errno;
587                 }
588                 /* Create the mount point, but be conservative -- refuse to create block
589                 * and char devices. */
590                 if (S_ISDIR(source_st.st_mode))
591                         mkdir_label(where, 0755);
592                 else if (S_ISFIFO(source_st.st_mode))
593                         mkfifo(where, 0644);
594                 else if (S_ISSOCK(source_st.st_mode))
595                         mknod(where, 0644 | S_IFSOCK, 0);
596                 else if (S_ISREG(source_st.st_mode))
597                         touch(where);
598                 else {
599                         log_error("Refusing to create mountpoint for file: %s", *x);
600                         return -ENOTSUP;
601                 }
602
603                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
604                         log_error("mount(%s) failed: %m", where);
605                         return -errno;
606                 }
607
608                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
609                         log_error("mount(%s) failed: %m", where);
610                         return -errno;
611                 }
612         }
613
614         return 0;
615 }
616
617 static int setup_timezone(const char *dest) {
618         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
619         char *z, *y;
620         int r;
621
622         assert(dest);
623
624         /* Fix the timezone, if possible */
625         r = readlink_malloc("/etc/localtime", &p);
626         if (r < 0) {
627                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
628                 return 0;
629         }
630
631         z = path_startswith(p, "../usr/share/zoneinfo/");
632         if (!z)
633                 z = path_startswith(p, "/usr/share/zoneinfo/");
634         if (!z) {
635                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
636                 return 0;
637         }
638
639         where = strappend(dest, "/etc/localtime");
640         if (!where)
641                 return log_oom();
642
643         r = readlink_malloc(where, &q);
644         if (r >= 0) {
645                 y = path_startswith(q, "../usr/share/zoneinfo/");
646                 if (!y)
647                         y = path_startswith(q, "/usr/share/zoneinfo/");
648
649
650                 /* Already pointing to the right place? Then do nothing .. */
651                 if (y && streq(y, z))
652                         return 0;
653         }
654
655         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
656         if (!check)
657                 return log_oom();
658
659         if (access(check, F_OK) < 0) {
660                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
661                 return 0;
662         }
663
664         what = strappend("../usr/share/zoneinfo/", z);
665         if (!what)
666                 return log_oom();
667
668         unlink(where);
669         if (symlink(what, where) < 0) {
670                 log_error("Failed to correct timezone of container: %m");
671                 return 0;
672         }
673
674         return 0;
675 }
676
677 static int setup_resolv_conf(const char *dest) {
678         char _cleanup_free_ *where = NULL;
679
680         assert(dest);
681
682         if (arg_private_network)
683                 return 0;
684
685         /* Fix resolv.conf, if possible */
686         where = strappend(dest, "/etc/resolv.conf");
687         if (!where)
688                 return log_oom();
689
690         /* We don't really care for the results of this really. If it
691          * fails, it fails, but meh... */
692         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
693
694         return 0;
695 }
696
697 static int setup_boot_id(const char *dest) {
698         _cleanup_free_ char *from = NULL, *to = NULL;
699         sd_id128_t rnd;
700         char as_uuid[37];
701         int r;
702
703         assert(dest);
704
705         if (arg_share_system)
706                 return 0;
707
708         /* Generate a new randomized boot ID, so that each boot-up of
709          * the container gets a new one */
710
711         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
712         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
713         if (!from || !to)
714                 return log_oom();
715
716         r = sd_id128_randomize(&rnd);
717         if (r < 0) {
718                 log_error("Failed to generate random boot id: %s", strerror(-r));
719                 return r;
720         }
721
722         snprintf(as_uuid, sizeof(as_uuid),
723                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
724                  SD_ID128_FORMAT_VAL(rnd));
725         char_array_0(as_uuid);
726
727         r = write_string_file(from, as_uuid);
728         if (r < 0) {
729                 log_error("Failed to write boot id: %s", strerror(-r));
730                 return r;
731         }
732
733         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
734                 log_error("Failed to bind mount boot id: %m");
735                 r = -errno;
736         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
737                 log_warning("Failed to make boot id read-only: %m");
738
739         unlink(from);
740         return r;
741 }
742
743 static int copy_devnodes(const char *dest) {
744
745         static const char devnodes[] =
746                 "null\0"
747                 "zero\0"
748                 "full\0"
749                 "random\0"
750                 "urandom\0"
751                 "tty\0";
752
753         const char *d;
754         int r = 0;
755         _cleanup_umask_ mode_t u;
756
757         assert(dest);
758
759         u = umask(0000);
760
761         NULSTR_FOREACH(d, devnodes) {
762                 _cleanup_free_ char *from = NULL, *to = NULL;
763                 struct stat st;
764
765                 from = strappend("/dev/", d);
766                 to = strjoin(dest, "/dev/", d, NULL);
767                 if (!from || !to)
768                         return log_oom();
769
770                 if (stat(from, &st) < 0) {
771
772                         if (errno != ENOENT) {
773                                 log_error("Failed to stat %s: %m", from);
774                                 return -errno;
775                         }
776
777                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
778
779                         log_error("%s is not a char or block device, cannot copy", from);
780                         return -EIO;
781
782                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
783
784                         log_error("mknod(%s) failed: %m", dest);
785                         return  -errno;
786                 }
787         }
788
789         return r;
790 }
791
792 static int setup_ptmx(const char *dest) {
793         _cleanup_free_ char *p = NULL;
794
795         p = strappend(dest, "/dev/ptmx");
796         if (!p)
797                 return log_oom();
798
799         if (symlink("pts/ptmx", p) < 0) {
800                 log_error("Failed to create /dev/ptmx symlink: %m");
801                 return -errno;
802         }
803
804         return 0;
805 }
806
807 static int setup_dev_console(const char *dest, const char *console) {
808         struct stat st;
809         _cleanup_free_ char *to = NULL;
810         int r;
811         _cleanup_umask_ mode_t u;
812
813         assert(dest);
814         assert(console);
815
816         u = umask(0000);
817
818         if (stat(console, &st) < 0) {
819                 log_error("Failed to stat %s: %m", console);
820                 return -errno;
821
822         } else if (!S_ISCHR(st.st_mode)) {
823                 log_error("/dev/console is not a char device");
824                 return -EIO;
825         }
826
827         r = chmod_and_chown(console, 0600, 0, 0);
828         if (r < 0) {
829                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
830                 return r;
831         }
832
833         if (asprintf(&to, "%s/dev/console", dest) < 0)
834                 return log_oom();
835
836         /* We need to bind mount the right tty to /dev/console since
837          * ptys can only exist on pts file systems. To have something
838          * to bind mount things on we create a device node first, that
839          * has the right major/minor (note that the major minor
840          * doesn't actually matter here, since we mount it over
841          * anyway). */
842
843         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
844                 log_error("mknod() for /dev/console failed: %m");
845                 return -errno;
846         }
847
848         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
849                 log_error("Bind mount for /dev/console failed: %m");
850                 return -errno;
851         }
852
853         return 0;
854 }
855
856 static int setup_kmsg(const char *dest, int kmsg_socket) {
857         _cleanup_free_ char *from = NULL, *to = NULL;
858         int r, fd, k;
859         _cleanup_umask_ mode_t u;
860         union {
861                 struct cmsghdr cmsghdr;
862                 uint8_t buf[CMSG_SPACE(sizeof(int))];
863         } control = {};
864         struct msghdr mh = {
865                 .msg_control = &control,
866                 .msg_controllen = sizeof(control),
867         };
868         struct cmsghdr *cmsg;
869
870         assert(dest);
871         assert(kmsg_socket >= 0);
872
873         u = umask(0000);
874
875         /* We create the kmsg FIFO as /dev/kmsg, but immediately
876          * delete it after bind mounting it to /proc/kmsg. While FIFOs
877          * on the reading side behave very similar to /proc/kmsg,
878          * their writing side behaves differently from /dev/kmsg in
879          * that writing blocks when nothing is reading. In order to
880          * avoid any problems with containers deadlocking due to this
881          * we simply make /dev/kmsg unavailable to the container. */
882         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
883             asprintf(&to, "%s/proc/kmsg", dest) < 0)
884                 return log_oom();
885
886         if (mkfifo(from, 0600) < 0) {
887                 log_error("mkfifo() for /dev/kmsg failed: %m");
888                 return -errno;
889         }
890
891         r = chmod_and_chown(from, 0600, 0, 0);
892         if (r < 0) {
893                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
894                 return r;
895         }
896
897         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
898                 log_error("Bind mount for /proc/kmsg failed: %m");
899                 return -errno;
900         }
901
902         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
903         if (fd < 0) {
904                 log_error("Failed to open fifo: %m");
905                 return -errno;
906         }
907
908         cmsg = CMSG_FIRSTHDR(&mh);
909         cmsg->cmsg_level = SOL_SOCKET;
910         cmsg->cmsg_type = SCM_RIGHTS;
911         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
912         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
913
914         mh.msg_controllen = cmsg->cmsg_len;
915
916         /* Store away the fd in the socket, so that it stays open as
917          * long as we run the child */
918         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
919         close_nointr_nofail(fd);
920
921         if (k < 0) {
922                 log_error("Failed to send FIFO fd: %m");
923                 return -errno;
924         }
925
926         /* And now make the FIFO unavailable as /dev/kmsg... */
927         unlink(from);
928         return 0;
929 }
930
931 static int setup_hostname(void) {
932
933         if (arg_share_system)
934                 return 0;
935
936         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
937                 return -errno;
938
939         return 0;
940 }
941
942 static int setup_journal(const char *directory) {
943         sd_id128_t machine_id, this_id;
944         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
945         char *id;
946         int r;
947
948         p = strappend(directory, "/etc/machine-id");
949         if (!p)
950                 return log_oom();
951
952         r = read_one_line_file(p, &b);
953         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
954                 return 0;
955         else if (r < 0) {
956                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
957                 return r;
958         }
959
960         id = strstrip(b);
961         if (isempty(id) && arg_link_journal == LINK_AUTO)
962                 return 0;
963
964         /* Verify validity */
965         r = sd_id128_from_string(id, &machine_id);
966         if (r < 0) {
967                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
968                 return r;
969         }
970
971         r = sd_id128_get_machine(&this_id);
972         if (r < 0) {
973                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
974                 return r;
975         }
976
977         if (sd_id128_equal(machine_id, this_id)) {
978                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
979                          "Host and machine ids are equal (%s): refusing to link journals", id);
980                 if (arg_link_journal == LINK_AUTO)
981                         return 0;
982                 return
983                         -EEXIST;
984         }
985
986         if (arg_link_journal == LINK_NO)
987                 return 0;
988
989         free(p);
990         p = strappend("/var/log/journal/", id);
991         q = strjoin(directory, "/var/log/journal/", id, NULL);
992         if (!p || !q)
993                 return log_oom();
994
995         if (path_is_mount_point(p, false) > 0) {
996                 if (arg_link_journal != LINK_AUTO) {
997                         log_error("%s: already a mount point, refusing to use for journal", p);
998                         return -EEXIST;
999                 }
1000
1001                 return 0;
1002         }
1003
1004         if (path_is_mount_point(q, false) > 0) {
1005                 if (arg_link_journal != LINK_AUTO) {
1006                         log_error("%s: already a mount point, refusing to use for journal", q);
1007                         return -EEXIST;
1008                 }
1009
1010                 return 0;
1011         }
1012
1013         r = readlink_and_make_absolute(p, &d);
1014         if (r >= 0) {
1015                 if ((arg_link_journal == LINK_GUEST ||
1016                      arg_link_journal == LINK_AUTO) &&
1017                     path_equal(d, q)) {
1018
1019                         r = mkdir_p(q, 0755);
1020                         if (r < 0)
1021                                 log_warning("failed to create directory %s: %m", q);
1022                         return 0;
1023                 }
1024
1025                 if (unlink(p) < 0) {
1026                         log_error("Failed to remove symlink %s: %m", p);
1027                         return -errno;
1028                 }
1029         } else if (r == -EINVAL) {
1030
1031                 if (arg_link_journal == LINK_GUEST &&
1032                     rmdir(p) < 0) {
1033
1034                         if (errno == ENOTDIR) {
1035                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1036                                 return r;
1037                         } else {
1038                                 log_error("Failed to remove %s: %m", p);
1039                                 return -errno;
1040                         }
1041                 }
1042         } else if (r != -ENOENT) {
1043                 log_error("readlink(%s) failed: %m", p);
1044                 return r;
1045         }
1046
1047         if (arg_link_journal == LINK_GUEST) {
1048
1049                 if (symlink(q, p) < 0) {
1050                         log_error("Failed to symlink %s to %s: %m", q, p);
1051                         return -errno;
1052                 }
1053
1054                 r = mkdir_p(q, 0755);
1055                 if (r < 0)
1056                         log_warning("failed to create directory %s: %m", q);
1057                 return 0;
1058         }
1059
1060         if (arg_link_journal == LINK_HOST) {
1061                 r = mkdir_p(p, 0755);
1062                 if (r < 0) {
1063                         log_error("Failed to create %s: %m", p);
1064                         return r;
1065                 }
1066
1067         } else if (access(p, F_OK) < 0)
1068                 return 0;
1069
1070         if (dir_is_empty(q) == 0) {
1071                 log_error("%s not empty.", q);
1072                 return -ENOTEMPTY;
1073         }
1074
1075         r = mkdir_p(q, 0755);
1076         if (r < 0) {
1077                 log_error("Failed to create %s: %m", q);
1078                 return r;
1079         }
1080
1081         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1082                 log_error("Failed to bind mount journal from host into guest: %m");
1083                 return -errno;
1084         }
1085
1086         return 0;
1087 }
1088
1089 static int setup_kdbus(const char *dest, const char *path) {
1090         const char *p;
1091
1092         if (!path)
1093                 return 0;
1094
1095         p = strappenda(dest, "/dev/kdbus");
1096         if (mkdir(p, 0755) < 0) {
1097                 log_error("Failed to create kdbus path: %m");
1098                 return  -errno;
1099         }
1100
1101         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1102                 log_error("Failed to mount kdbus domain path: %m");
1103                 return -errno;
1104         }
1105
1106         return 0;
1107 }
1108
1109 static int drop_capabilities(void) {
1110         return capability_bounding_set_drop(~arg_retain, false);
1111 }
1112
1113 static int register_machine(pid_t pid) {
1114         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1115         _cleanup_bus_unref_ sd_bus *bus = NULL;
1116         int r;
1117
1118         if (!arg_register)
1119                 return 0;
1120
1121         r = sd_bus_default_system(&bus);
1122         if (r < 0) {
1123                 log_error("Failed to open system bus: %s", strerror(-r));
1124                 return r;
1125         }
1126
1127         if (arg_keep_unit) {
1128                 r = sd_bus_call_method(
1129                                 bus,
1130                                 "org.freedesktop.machine1",
1131                                 "/org/freedesktop/machine1",
1132                                 "org.freedesktop.machine1.Manager",
1133                                 "RegisterMachine",
1134                                 &error,
1135                                 NULL,
1136                                 "sayssus",
1137                                 arg_machine,
1138                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1139                                 "nspawn",
1140                                 "container",
1141                                 (uint32_t) pid,
1142                                 strempty(arg_directory));
1143         } else {
1144                 r = sd_bus_call_method(
1145                                 bus,
1146                                 "org.freedesktop.machine1",
1147                                 "/org/freedesktop/machine1",
1148                                 "org.freedesktop.machine1.Manager",
1149                                 "CreateMachine",
1150                                 &error,
1151                                 NULL,
1152                                 "sayssusa(sv)",
1153                                 arg_machine,
1154                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1155                                 "nspawn",
1156                                 "container",
1157                                 (uint32_t) pid,
1158                                 strempty(arg_directory),
1159                                 !isempty(arg_slice), "Slice", "s", arg_slice);
1160         }
1161
1162         if (r < 0) {
1163                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1164                 return r;
1165         }
1166
1167         return 0;
1168 }
1169
1170 static int terminate_machine(pid_t pid) {
1171         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1172         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1173         _cleanup_bus_unref_ sd_bus *bus = NULL;
1174         const char *path;
1175         int r;
1176
1177         if (!arg_register)
1178                 return 0;
1179
1180         r = sd_bus_default_system(&bus);
1181         if (r < 0) {
1182                 log_error("Failed to open system bus: %s", strerror(-r));
1183                 return r;
1184         }
1185
1186         r = sd_bus_call_method(
1187                         bus,
1188                         "org.freedesktop.machine1",
1189                         "/org/freedesktop/machine1",
1190                         "org.freedesktop.machine1.Manager",
1191                         "GetMachineByPID",
1192                         &error,
1193                         &reply,
1194                         "u",
1195                         (uint32_t) pid);
1196         if (r < 0) {
1197                 /* Note that the machine might already have been
1198                  * cleaned up automatically, hence don't consider it a
1199                  * failure if we cannot get the machine object. */
1200                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1201                 return 0;
1202         }
1203
1204         r = sd_bus_message_read(reply, "o", &path);
1205         if (r < 0)
1206                 return bus_log_parse_error(r);
1207
1208         r = sd_bus_call_method(
1209                         bus,
1210                         "org.freedesktop.machine1",
1211                         path,
1212                         "org.freedesktop.machine1.Machine",
1213                         "Terminate",
1214                         &error,
1215                         NULL,
1216                         NULL);
1217         if (r < 0) {
1218                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1219                 return 0;
1220         }
1221
1222         return 0;
1223 }
1224
1225 static int reset_audit_loginuid(void) {
1226         _cleanup_free_ char *p = NULL;
1227         int r;
1228
1229         if (arg_share_system)
1230                 return 0;
1231
1232         r = read_one_line_file("/proc/self/loginuid", &p);
1233         if (r == -EEXIST)
1234                 return 0;
1235         if (r < 0) {
1236                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1237                 return r;
1238         }
1239
1240         /* Already reset? */
1241         if (streq(p, "4294967295"))
1242                 return 0;
1243
1244         r = write_string_file("/proc/self/loginuid", "4294967295");
1245         if (r < 0) {
1246                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1247                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1248                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1249                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1250                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1251
1252                 sleep(5);
1253         }
1254
1255         return 0;
1256 }
1257
1258 static int move_network_interfaces(pid_t pid) {
1259         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1260         _cleanup_udev_unref_ struct udev *udev = NULL;
1261         char **i;
1262         int r;
1263
1264         if (!arg_private_network)
1265                 return 0;
1266
1267         if (strv_isempty(arg_network_interfaces))
1268                 return 0;
1269
1270         r = sd_rtnl_open(0, &rtnl);
1271         if (r < 0) {
1272                 log_error("Failed to connect to netlink: %s", strerror(-r));
1273                 return r;
1274         }
1275
1276         udev = udev_new();
1277         if (!udev) {
1278                 log_error("Failed to connect to udev.");
1279                 return -ENOMEM;
1280         }
1281
1282         STRV_FOREACH(i, arg_network_interfaces) {
1283                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1284                 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1285                 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1286                 int ifi;
1287
1288                 ifi = (int) if_nametoindex(*i);
1289                 if (ifi <= 0) {
1290                         log_error("Failed to resolve interface %s: %m", *i);
1291                         return -errno;
1292                 }
1293
1294                 sprintf(ifi_str, "n%i", ifi);
1295                 d = udev_device_new_from_device_id(udev, ifi_str);
1296                 if (!d) {
1297                         log_error("Failed to get udev device for interface %s: %m", *i);
1298                         return -errno;
1299                 }
1300
1301                 if (udev_device_get_is_initialized(d) <= 0) {
1302                         log_error("Network interface %s is not initialized yet.", *i);
1303                         return -EBUSY;
1304                 }
1305
1306                 r = sd_rtnl_message_new_link(RTM_NEWLINK, ifi, &m);
1307                 if (r < 0) {
1308                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1309                         return r;
1310                 }
1311
1312                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1313                 if (r < 0) {
1314                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1315                         return r;
1316                 }
1317
1318                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1319                 if (r < 0) {
1320                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1321                         return r;
1322                 }
1323         }
1324
1325         return 0;
1326 }
1327
1328 int main(int argc, char *argv[]) {
1329         pid_t pid = 0;
1330         int r = EXIT_FAILURE, k;
1331         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1332         int n_fd_passed;
1333         const char *console = NULL;
1334         sigset_t mask;
1335         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1336         _cleanup_fdset_free_ FDSet *fds = NULL;
1337         _cleanup_free_ char *kdbus_domain = NULL;
1338
1339         log_parse_environment();
1340         log_open();
1341
1342         k = parse_argv(argc, argv);
1343         if (k < 0)
1344                 goto finish;
1345         else if (k == 0) {
1346                 r = EXIT_SUCCESS;
1347                 goto finish;
1348         }
1349
1350         if (arg_directory) {
1351                 char *p;
1352
1353                 p = path_make_absolute_cwd(arg_directory);
1354                 free(arg_directory);
1355                 arg_directory = p;
1356         } else
1357                 arg_directory = get_current_dir_name();
1358
1359         if (!arg_directory) {
1360                 log_error("Failed to determine path, please use -D.");
1361                 goto finish;
1362         }
1363
1364         path_kill_slashes(arg_directory);
1365
1366         if (!arg_machine) {
1367                 arg_machine = strdup(basename(arg_directory));
1368                 if (!arg_machine) {
1369                         log_oom();
1370                         goto finish;
1371                 }
1372
1373                 hostname_cleanup(arg_machine, false);
1374                 if (isempty(arg_machine)) {
1375                         log_error("Failed to determine machine name automatically, please use -M.");
1376                         goto finish;
1377                 }
1378         }
1379
1380         if (geteuid() != 0) {
1381                 log_error("Need to be root.");
1382                 goto finish;
1383         }
1384
1385         if (sd_booted() <= 0) {
1386                 log_error("Not running on a systemd system.");
1387                 goto finish;
1388         }
1389
1390         if (path_equal(arg_directory, "/")) {
1391                 log_error("Spawning container on root directory not supported.");
1392                 goto finish;
1393         }
1394
1395         if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1396                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1397                 goto finish;
1398         }
1399
1400         log_close();
1401         n_fd_passed = sd_listen_fds(false);
1402         if (n_fd_passed > 0) {
1403                 k = fdset_new_listen_fds(&fds, false);
1404                 if (k < 0) {
1405                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1406                         goto finish;
1407                 }
1408         }
1409         fdset_close_others(fds);
1410         log_open();
1411
1412         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1413         if (master < 0) {
1414                 log_error("Failed to acquire pseudo tty: %m");
1415                 goto finish;
1416         }
1417
1418         console = ptsname(master);
1419         if (!console) {
1420                 log_error("Failed to determine tty name: %m");
1421                 goto finish;
1422         }
1423
1424         if (!arg_quiet)
1425                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1426
1427         if (unlockpt(master) < 0) {
1428                 log_error("Failed to unlock tty: %m");
1429                 goto finish;
1430         }
1431
1432
1433         if (access("/dev/kdbus/control", F_OK) >= 0) {
1434
1435                 if (arg_share_system) {
1436                         kdbus_domain = strdup("/dev/kdbus");
1437                         if (!kdbus_domain) {
1438                                 log_oom();
1439                                 goto finish;
1440                         }
1441                 } else {
1442                         const char *ns;
1443
1444                         ns = strappenda("machine-", arg_machine);
1445                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1446                         if (r < 0)
1447                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1448                         else
1449                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1450                 }
1451         }
1452
1453         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1454                 log_error("Failed to create kmsg socket pair: %m");
1455                 goto finish;
1456         }
1457
1458         sd_notify(0, "READY=1");
1459
1460         assert_se(sigemptyset(&mask) == 0);
1461         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1462         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1463
1464         for (;;) {
1465                 siginfo_t status;
1466
1467                 sync_fd = eventfd(0, EFD_CLOEXEC);
1468                 if (sync_fd < 0) {
1469                         log_error("Failed to create event fd: %m");
1470                         goto finish;
1471                 }
1472
1473                 pid = syscall(__NR_clone,
1474                               SIGCHLD|CLONE_NEWNS|
1475                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1476                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
1477                 if (pid < 0) {
1478                         if (errno == EINVAL)
1479                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1480                         else
1481                                 log_error("clone() failed: %m");
1482
1483                         goto finish;
1484                 }
1485
1486                 if (pid == 0) {
1487                         /* child */
1488                         const char *home = NULL;
1489                         uid_t uid = (uid_t) -1;
1490                         gid_t gid = (gid_t) -1;
1491                         unsigned n_env = 2;
1492                         const char *envp[] = {
1493                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1494                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1495                                 NULL, /* TERM */
1496                                 NULL, /* HOME */
1497                                 NULL, /* USER */
1498                                 NULL, /* LOGNAME */
1499                                 NULL, /* container_uuid */
1500                                 NULL, /* LISTEN_FDS */
1501                                 NULL, /* LISTEN_PID */
1502                                 NULL
1503                         };
1504                         char **env_use;
1505                         eventfd_t x;
1506
1507                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1508                         if (envp[n_env])
1509                                 n_env ++;
1510
1511                         close_nointr_nofail(master);
1512                         master = -1;
1513
1514                         close_nointr(STDIN_FILENO);
1515                         close_nointr(STDOUT_FILENO);
1516                         close_nointr(STDERR_FILENO);
1517
1518                         close_nointr_nofail(kmsg_socket_pair[0]);
1519                         kmsg_socket_pair[0] = -1;
1520
1521                         reset_all_signal_handlers();
1522
1523                         assert_se(sigemptyset(&mask) == 0);
1524                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1525
1526                         k = open_terminal(console, O_RDWR);
1527                         if (k != STDIN_FILENO) {
1528                                 if (k >= 0) {
1529                                         close_nointr_nofail(k);
1530                                         k = -EINVAL;
1531                                 }
1532
1533                                 log_error("Failed to open console: %s", strerror(-k));
1534                                 goto child_fail;
1535                         }
1536
1537                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1538                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1539                                 log_error("Failed to duplicate console: %m");
1540                                 goto child_fail;
1541                         }
1542
1543                         if (setsid() < 0) {
1544                                 log_error("setsid() failed: %m");
1545                                 goto child_fail;
1546                         }
1547
1548                         if (reset_audit_loginuid() < 0)
1549                                 goto child_fail;
1550
1551                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1552                                 log_error("PR_SET_PDEATHSIG failed: %m");
1553                                 goto child_fail;
1554                         }
1555
1556                         /* Mark everything as slave, so that we still
1557                          * receive mounts from the real root, but don't
1558                          * propagate mounts to the real root. */
1559                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1560                                 log_error("MS_SLAVE|MS_REC failed: %m");
1561                                 goto child_fail;
1562                         }
1563
1564                         /* Turn directory into bind mount */
1565                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1566                                 log_error("Failed to make bind mount.");
1567                                 goto child_fail;
1568                         }
1569
1570                         if (arg_read_only)
1571                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1572                                         log_error("Failed to make read-only.");
1573                                         goto child_fail;
1574                                 }
1575
1576                         if (mount_all(arg_directory) < 0)
1577                                 goto child_fail;
1578
1579                         if (copy_devnodes(arg_directory) < 0)
1580                                 goto child_fail;
1581
1582                         if (setup_ptmx(arg_directory) < 0)
1583                                 goto child_fail;
1584
1585                         dev_setup(arg_directory);
1586
1587                         if (setup_dev_console(arg_directory, console) < 0)
1588                                 goto child_fail;
1589
1590                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1591                                 goto child_fail;
1592
1593                         close_nointr_nofail(kmsg_socket_pair[1]);
1594                         kmsg_socket_pair[1] = -1;
1595
1596                         if (setup_boot_id(arg_directory) < 0)
1597                                 goto child_fail;
1598
1599                         if (setup_timezone(arg_directory) < 0)
1600                                 goto child_fail;
1601
1602                         if (setup_resolv_conf(arg_directory) < 0)
1603                                 goto child_fail;
1604
1605                         if (setup_journal(arg_directory) < 0)
1606                                 goto child_fail;
1607
1608                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1609                                 goto child_fail;
1610
1611                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1612                                 goto child_fail;
1613
1614                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1615                                 goto child_fail;
1616
1617                         if (chdir(arg_directory) < 0) {
1618                                 log_error("chdir(%s) failed: %m", arg_directory);
1619                                 goto child_fail;
1620                         }
1621
1622                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1623                                 log_error("mount(MS_MOVE) failed: %m");
1624                                 goto child_fail;
1625                         }
1626
1627                         if (chroot(".") < 0) {
1628                                 log_error("chroot() failed: %m");
1629                                 goto child_fail;
1630                         }
1631
1632                         if (chdir("/") < 0) {
1633                                 log_error("chdir() failed: %m");
1634                                 goto child_fail;
1635                         }
1636
1637                         umask(0022);
1638
1639                         if (arg_private_network)
1640                                 loopback_setup();
1641
1642                         if (drop_capabilities() < 0) {
1643                                 log_error("drop_capabilities() failed: %m");
1644                                 goto child_fail;
1645                         }
1646
1647                         if (arg_user) {
1648
1649                                 /* Note that this resolves user names
1650                                  * inside the container, and hence
1651                                  * accesses the NSS modules from the
1652                                  * container and not the host. This is
1653                                  * a bit weird... */
1654
1655                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1656                                         log_error("get_user_creds() failed: %m");
1657                                         goto child_fail;
1658                                 }
1659
1660                                 if (mkdir_parents_label(home, 0775) < 0) {
1661                                         log_error("mkdir_parents_label() failed: %m");
1662                                         goto child_fail;
1663                                 }
1664
1665                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1666                                         log_error("mkdir_safe_label() failed: %m");
1667                                         goto child_fail;
1668                                 }
1669
1670                                 if (initgroups((const char*)arg_user, gid) < 0) {
1671                                         log_error("initgroups() failed: %m");
1672                                         goto child_fail;
1673                                 }
1674
1675                                 if (setresgid(gid, gid, gid) < 0) {
1676                                         log_error("setregid() failed: %m");
1677                                         goto child_fail;
1678                                 }
1679
1680                                 if (setresuid(uid, uid, uid) < 0) {
1681                                         log_error("setreuid() failed: %m");
1682                                         goto child_fail;
1683                                 }
1684                         } else {
1685                                 /* Reset everything fully to 0, just in case */
1686
1687                                 if (setgroups(0, NULL) < 0) {
1688                                         log_error("setgroups() failed: %m");
1689                                         goto child_fail;
1690                                 }
1691
1692                                 if (setresgid(0, 0, 0) < 0) {
1693                                         log_error("setregid() failed: %m");
1694                                         goto child_fail;
1695                                 }
1696
1697                                 if (setresuid(0, 0, 0) < 0) {
1698                                         log_error("setreuid() failed: %m");
1699                                         goto child_fail;
1700                                 }
1701                         }
1702
1703                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1704                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1705                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1706                                 log_oom();
1707                                 goto child_fail;
1708                         }
1709
1710                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1711                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1712                                         log_oom();
1713                                         goto child_fail;
1714                                 }
1715                         }
1716
1717                         if (fdset_size(fds) > 0) {
1718                                 k = fdset_cloexec(fds, false);
1719                                 if (k < 0) {
1720                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1721                                         goto child_fail;
1722                                 }
1723
1724                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1725                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1726                                         log_oom();
1727                                         goto child_fail;
1728                                 }
1729                         }
1730
1731                         setup_hostname();
1732
1733                         eventfd_read(sync_fd, &x);
1734                         close_nointr_nofail(sync_fd);
1735                         sync_fd = -1;
1736
1737                         if (!strv_isempty(arg_setenv)) {
1738                                 char **n;
1739
1740                                 n = strv_env_merge(2, envp, arg_setenv);
1741                                 if (!n) {
1742                                         log_oom();
1743                                         goto child_fail;
1744                                 }
1745
1746                                 env_use = n;
1747                         } else
1748                                 env_use = (char**) envp;
1749
1750 #ifdef HAVE_SELINUX
1751                         if (arg_selinux_context)
1752                                 if (setexeccon(arg_selinux_context) < 0)
1753                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1754 #endif
1755                         if (arg_boot) {
1756                                 char **a;
1757                                 size_t l;
1758
1759                                 /* Automatically search for the init system */
1760
1761                                 l = 1 + argc - optind;
1762                                 a = newa(char*, l + 1);
1763                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1764
1765                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1766                                 execve(a[0], a, env_use);
1767
1768                                 a[0] = (char*) "/lib/systemd/systemd";
1769                                 execve(a[0], a, env_use);
1770
1771                                 a[0] = (char*) "/sbin/init";
1772                                 execve(a[0], a, env_use);
1773                         } else if (argc > optind)
1774                                 execvpe(argv[optind], argv + optind, env_use);
1775                         else {
1776                                 chdir(home ? home : "/root");
1777                                 execle("/bin/bash", "-bash", NULL, env_use);
1778                         }
1779
1780                         log_error("execv() failed: %m");
1781
1782                 child_fail:
1783                         _exit(EXIT_FAILURE);
1784                 }
1785
1786                 fdset_free(fds);
1787                 fds = NULL;
1788
1789                 r = register_machine(pid);
1790                 if (r < 0)
1791                         goto finish;
1792
1793                 r = move_network_interfaces(pid);
1794                 if (r < 0)
1795                         goto finish;
1796
1797                 eventfd_write(sync_fd, 1);
1798                 close_nointr_nofail(sync_fd);
1799                 sync_fd = -1;
1800
1801                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1802                 if (k < 0) {
1803                         r = EXIT_FAILURE;
1804                         break;
1805                 }
1806
1807                 if (!arg_quiet)
1808                         putc('\n', stdout);
1809
1810                 /* Kill if it is not dead yet anyway */
1811                 terminate_machine(pid);
1812
1813                 /* Redundant, but better safe than sorry */
1814                 kill(pid, SIGKILL);
1815
1816                 k = wait_for_terminate(pid, &status);
1817                 pid = 0;
1818
1819                 if (k < 0) {
1820                         r = EXIT_FAILURE;
1821                         break;
1822                 }
1823
1824                 if (status.si_code == CLD_EXITED) {
1825                         r = status.si_status;
1826                         if (status.si_status != 0) {
1827                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1828                                 break;
1829                         }
1830
1831                         if (!arg_quiet)
1832                                 log_debug("Container %s exited successfully.", arg_machine);
1833                         break;
1834                 } else if (status.si_code == CLD_KILLED &&
1835                            status.si_status == SIGINT) {
1836
1837                         if (!arg_quiet)
1838                                 log_info("Container %s has been shut down.", arg_machine);
1839                         r = 0;
1840                         break;
1841                 } else if (status.si_code == CLD_KILLED &&
1842                            status.si_status == SIGHUP) {
1843
1844                         if (!arg_quiet)
1845                                 log_info("Container %s is being rebooted.", arg_machine);
1846                         continue;
1847                 } else if (status.si_code == CLD_KILLED ||
1848                            status.si_code == CLD_DUMPED) {
1849
1850                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1851                         r = EXIT_FAILURE;
1852                         break;
1853                 } else {
1854                         log_error("Container %s failed due to unknown reason.", arg_machine);
1855                         r = EXIT_FAILURE;
1856                         break;
1857                 }
1858         }
1859
1860 finish:
1861         if (pid > 0)
1862                 kill(pid, SIGKILL);
1863
1864         free(arg_directory);
1865         free(arg_machine);
1866         free(arg_setenv);
1867         free(arg_network_interfaces);
1868
1869         return r;
1870 }