chiark / gitweb /
b4c5a549443b2036335d12ebcf4be09a2e459c28
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <linux/rtnetlink.h>
44 #include <sys/eventfd.h>
45 #include <net/if.h>
46 #include <linux/veth.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #include "sd-daemon.h"
53 #include "sd-bus.h"
54 #include "sd-id128.h"
55 #include "sd-rtnl.h"
56 #include "log.h"
57 #include "util.h"
58 #include "mkdir.h"
59 #include "macro.h"
60 #include "audit.h"
61 #include "missing.h"
62 #include "cgroup-util.h"
63 #include "strv.h"
64 #include "path-util.h"
65 #include "loopback-setup.h"
66 #include "dev-setup.h"
67 #include "fdset.h"
68 #include "build.h"
69 #include "fileio.h"
70 #include "bus-util.h"
71 #include "bus-error.h"
72 #include "ptyfwd.h"
73 #include "bus-kernel.h"
74 #include "env-util.h"
75 #include "def.h"
76 #include "rtnl-util.h"
77 #include "udev-util.h"
78
/* How the journal of the container is linked up, as selected with
 * --link-journal= (or -j) on the command line. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto, the built-in default */
        LINK_HOST  = 2,  /* --link-journal=host */
        LINK_GUEST = 3,  /* --link-journal=guest */
} LinkJournal;
85
86 static char *arg_directory = NULL;
87 static char *arg_user = NULL;
88 static sd_id128_t arg_uuid = {};
89 static char *arg_machine = NULL;
90 static char *arg_selinux_context = NULL;
91 static char *arg_selinux_apifs_context = NULL;
92 static const char *arg_slice = NULL;
93 static bool arg_private_network = false;
94 static bool arg_read_only = false;
95 static bool arg_boot = false;
96 static LinkJournal arg_link_journal = LINK_AUTO;
97 static uint64_t arg_retain =
98         (1ULL << CAP_CHOWN) |
99         (1ULL << CAP_DAC_OVERRIDE) |
100         (1ULL << CAP_DAC_READ_SEARCH) |
101         (1ULL << CAP_FOWNER) |
102         (1ULL << CAP_FSETID) |
103         (1ULL << CAP_IPC_OWNER) |
104         (1ULL << CAP_KILL) |
105         (1ULL << CAP_LEASE) |
106         (1ULL << CAP_LINUX_IMMUTABLE) |
107         (1ULL << CAP_NET_BIND_SERVICE) |
108         (1ULL << CAP_NET_BROADCAST) |
109         (1ULL << CAP_NET_RAW) |
110         (1ULL << CAP_SETGID) |
111         (1ULL << CAP_SETFCAP) |
112         (1ULL << CAP_SETPCAP) |
113         (1ULL << CAP_SETUID) |
114         (1ULL << CAP_SYS_ADMIN) |
115         (1ULL << CAP_SYS_CHROOT) |
116         (1ULL << CAP_SYS_NICE) |
117         (1ULL << CAP_SYS_PTRACE) |
118         (1ULL << CAP_SYS_TTY_CONFIG) |
119         (1ULL << CAP_SYS_RESOURCE) |
120         (1ULL << CAP_SYS_BOOT) |
121         (1ULL << CAP_AUDIT_WRITE) |
122         (1ULL << CAP_AUDIT_CONTROL) |
123         (1ULL << CAP_MKNOD);
124 static char **arg_bind = NULL;
125 static char **arg_bind_ro = NULL;
126 static char **arg_setenv = NULL;
127 static bool arg_quiet = false;
128 static bool arg_share_system = false;
129 static bool arg_register = true;
130 static bool arg_keep_unit = false;
131 static char **arg_network_interfaces = NULL;
132 static bool arg_network_veth = false;
133
134 static int help(void) {
135
136         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
137                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
138                "  -h --help                 Show this help\n"
139                "     --version              Print version string\n"
140                "  -q --quiet                Do not show status information\n"
141                "  -D --directory=NAME       Root directory for the container\n"
142                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
143                "  -u --user=USER            Run the command under specified user or uid\n"
144                "  -M --machine=NAME         Set the machine name for the container\n"
145                "     --uuid=UUID            Set a specific machine UUID for the container\n"
146                "  -S --slice=SLICE          Place the container in the specified slice\n"
147                "     --private-network      Disable network in container\n"
148                "     --network-interface=INTERFACE\n"
149                "                            Assign an existing network interface to the\n"
150                "                            container\n"
151                "     --network-veth         Add a a virtual ethernet connection between host\n"
152                "                            and container\n"
153                "  -Z --selinux-context=SECLABEL\n"
154                "                            Set the SELinux security context to be used by\n"
155                "                            processes in the container\n"
156                "  -L --selinux-apifs-context=SECLABEL\n"
157                "                            Set the SELinux security context to be used by\n"
158                "                            API/tmpfs file systems in the container\n"
159                "     --capability=CAP       In addition to the default, retain specified\n"
160                "                            capability\n"
161                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
162                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
163                "  -j                        Equivalent to --link-journal=host\n"
164                "     --read-only            Mount the root directory read-only\n"
165                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
166                "                            the container\n"
167                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
168                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
169                "     --share-system         Share system namespaces with host\n"
170                "     --register=BOOLEAN     Register container as machine\n"
171                "     --keep-unit            Do not register a scope for the machine, reuse\n"
172                "                            the service unit nspawn is running in\n",
173                program_invocation_short_name);
174
175         return 0;
176 }
177
178 static int parse_argv(int argc, char *argv[]) {
179
180         enum {
181                 ARG_VERSION = 0x100,
182                 ARG_PRIVATE_NETWORK,
183                 ARG_UUID,
184                 ARG_READ_ONLY,
185                 ARG_CAPABILITY,
186                 ARG_DROP_CAPABILITY,
187                 ARG_LINK_JOURNAL,
188                 ARG_BIND,
189                 ARG_BIND_RO,
190                 ARG_SETENV,
191                 ARG_SHARE_SYSTEM,
192                 ARG_REGISTER,
193                 ARG_KEEP_UNIT,
194                 ARG_NETWORK_INTERFACE,
195                 ARG_NETWORK_VETH,
196         };
197
198         static const struct option options[] = {
199                 { "help",                  no_argument,       NULL, 'h'                   },
200                 { "version",               no_argument,       NULL, ARG_VERSION           },
201                 { "directory",             required_argument, NULL, 'D'                   },
202                 { "user",                  required_argument, NULL, 'u'                   },
203                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
204                 { "boot",                  no_argument,       NULL, 'b'                   },
205                 { "uuid",                  required_argument, NULL, ARG_UUID              },
206                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
207                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
208                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
209                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
210                 { "bind",                  required_argument, NULL, ARG_BIND              },
211                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
212                 { "machine",               required_argument, NULL, 'M'                   },
213                 { "slice",                 required_argument, NULL, 'S'                   },
214                 { "setenv",                required_argument, NULL, ARG_SETENV            },
215                 { "selinux-context",       required_argument, NULL, 'Z'                   },
216                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
217                 { "quiet",                 no_argument,       NULL, 'q'                   },
218                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
219                 { "register",              required_argument, NULL, ARG_REGISTER          },
220                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
221                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
222                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH   },
223                 {}
224         };
225
226         int c, r;
227         uint64_t plus = 0, minus = 0;
228
229         assert(argc >= 0);
230         assert(argv);
231
232         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
233
234                 switch (c) {
235
236                 case 'h':
237                         return help();
238
239                 case ARG_VERSION:
240                         puts(PACKAGE_STRING);
241                         puts(SYSTEMD_FEATURES);
242                         return 0;
243
244                 case 'D':
245                         free(arg_directory);
246                         arg_directory = canonicalize_file_name(optarg);
247                         if (!arg_directory) {
248                                 log_error("Invalid root directory: %m");
249                                 return -ENOMEM;
250                         }
251
252                         break;
253
254                 case 'u':
255                         free(arg_user);
256                         arg_user = strdup(optarg);
257                         if (!arg_user)
258                                 return log_oom();
259
260                         break;
261
262                 case ARG_NETWORK_VETH:
263                         arg_network_veth = true;
264                         arg_private_network = true;
265                         break;
266
267                 case ARG_NETWORK_INTERFACE:
268                         if (strv_push(&arg_network_interfaces, optarg) < 0)
269                                 return log_oom();
270
271                         /* fall through */
272
273                 case ARG_PRIVATE_NETWORK:
274                         arg_private_network = true;
275                         break;
276
277                 case 'b':
278                         arg_boot = true;
279                         break;
280
281                 case ARG_UUID:
282                         r = sd_id128_from_string(optarg, &arg_uuid);
283                         if (r < 0) {
284                                 log_error("Invalid UUID: %s", optarg);
285                                 return r;
286                         }
287                         break;
288
289                 case 'S':
290                         arg_slice = strdup(optarg);
291                         if (!arg_slice)
292                                 return log_oom();
293
294                         break;
295
296                 case 'M':
297                         if (isempty(optarg)) {
298                                 free(arg_machine);
299                                 arg_machine = NULL;
300                         } else {
301
302                                 if (!hostname_is_valid(optarg)) {
303                                         log_error("Invalid machine name: %s", optarg);
304                                         return -EINVAL;
305                                 }
306
307                                 free(arg_machine);
308                                 arg_machine = strdup(optarg);
309                                 if (!arg_machine)
310                                         return log_oom();
311
312                                 break;
313                         }
314
315                 case 'Z':
316                         arg_selinux_context = optarg;
317                         break;
318
319                 case 'L':
320                         arg_selinux_apifs_context = optarg;
321                         break;
322
323                 case ARG_READ_ONLY:
324                         arg_read_only = true;
325                         break;
326
327                 case ARG_CAPABILITY:
328                 case ARG_DROP_CAPABILITY: {
329                         char *state, *word;
330                         size_t length;
331
332                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
333                                 _cleanup_free_ char *t;
334                                 cap_value_t cap;
335
336                                 t = strndup(word, length);
337                                 if (!t)
338                                         return log_oom();
339
340                                 if (streq(t, "all")) {
341                                         if (c == ARG_CAPABILITY)
342                                                 plus = (uint64_t) -1;
343                                         else
344                                                 minus = (uint64_t) -1;
345                                 } else {
346                                         if (cap_from_name(t, &cap) < 0) {
347                                                 log_error("Failed to parse capability %s.", t);
348                                                 return -EINVAL;
349                                         }
350
351                                         if (c == ARG_CAPABILITY)
352                                                 plus |= 1ULL << (uint64_t) cap;
353                                         else
354                                                 minus |= 1ULL << (uint64_t) cap;
355                                 }
356                         }
357
358                         break;
359                 }
360
361                 case 'j':
362                         arg_link_journal = LINK_GUEST;
363                         break;
364
365                 case ARG_LINK_JOURNAL:
366                         if (streq(optarg, "auto"))
367                                 arg_link_journal = LINK_AUTO;
368                         else if (streq(optarg, "no"))
369                                 arg_link_journal = LINK_NO;
370                         else if (streq(optarg, "guest"))
371                                 arg_link_journal = LINK_GUEST;
372                         else if (streq(optarg, "host"))
373                                 arg_link_journal = LINK_HOST;
374                         else {
375                                 log_error("Failed to parse link journal mode %s", optarg);
376                                 return -EINVAL;
377                         }
378
379                         break;
380
381                 case ARG_BIND:
382                 case ARG_BIND_RO: {
383                         _cleanup_free_ char *a = NULL, *b = NULL;
384                         char *e;
385                         char ***x;
386
387                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
388
389                         e = strchr(optarg, ':');
390                         if (e) {
391                                 a = strndup(optarg, e - optarg);
392                                 b = strdup(e + 1);
393                         } else {
394                                 a = strdup(optarg);
395                                 b = strdup(optarg);
396                         }
397
398                         if (!a || !b)
399                                 return log_oom();
400
401                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
402                                 log_error("Invalid bind mount specification: %s", optarg);
403                                 return -EINVAL;
404                         }
405
406                         r = strv_extend(x, a);
407                         if (r < 0)
408                                 return log_oom();
409
410                         r = strv_extend(x, b);
411                         if (r < 0)
412                                 return log_oom();
413
414                         break;
415                 }
416
417                 case ARG_SETENV: {
418                         char **n;
419
420                         if (!env_assignment_is_valid(optarg)) {
421                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
422                                 return -EINVAL;
423                         }
424
425                         n = strv_env_set(arg_setenv, optarg);
426                         if (!n)
427                                 return log_oom();
428
429                         strv_free(arg_setenv);
430                         arg_setenv = n;
431                         break;
432                 }
433
434                 case 'q':
435                         arg_quiet = true;
436                         break;
437
438                 case ARG_SHARE_SYSTEM:
439                         arg_share_system = true;
440                         break;
441
442                 case ARG_REGISTER:
443                         r = parse_boolean(optarg);
444                         if (r < 0) {
445                                 log_error("Failed to parse --register= argument: %s", optarg);
446                                 return r;
447                         }
448
449                         arg_register = r;
450                         break;
451
452                 case ARG_KEEP_UNIT:
453                         arg_keep_unit = true;
454                         break;
455
456                 case '?':
457                         return -EINVAL;
458
459                 default:
460                         assert_not_reached("Unhandled option");
461                 }
462         }
463
464         if (arg_share_system)
465                 arg_register = false;
466
467         if (arg_boot && arg_share_system) {
468                 log_error("--boot and --share-system may not be combined.");
469                 return -EINVAL;
470         }
471
472         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
473                 log_error("--keep-unit may not be used when invoked from a user session.");
474                 return -EINVAL;
475         }
476
477         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
478
479         return 1;
480 }
481
482 static int mount_all(const char *dest) {
483
484         typedef struct MountPoint {
485                 const char *what;
486                 const char *where;
487                 const char *type;
488                 const char *options;
489                 unsigned long flags;
490                 bool fatal;
491         } MountPoint;
492
493         static const MountPoint mount_table[] = {
494                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
495                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
496                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
497                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
498                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
499                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
500                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
501                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
502 #ifdef HAVE_SELINUX
503                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
504                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
505 #endif
506         };
507
508         unsigned k;
509         int r = 0;
510
511         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
512                 _cleanup_free_ char *where = NULL;
513 #ifdef HAVE_SELINUX
514                 _cleanup_free_ char *options = NULL;
515 #endif
516                 const char *o;
517                 int t;
518
519                 where = strjoin(dest, "/", mount_table[k].where, NULL);
520                 if (!where)
521                         return log_oom();
522
523                 t = path_is_mount_point(where, true);
524                 if (t < 0) {
525                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
526
527                         if (r == 0)
528                                 r = t;
529
530                         continue;
531                 }
532
533                 /* Skip this entry if it is not a remount. */
534                 if (mount_table[k].what && t > 0)
535                         continue;
536
537                 mkdir_p(where, 0755);
538
539 #ifdef HAVE_SELINUX
540                 if (arg_selinux_apifs_context &&
541                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
542                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
543                         if (!options)
544                                 return log_oom();
545
546                         o = options;
547                 } else
548 #endif
549                         o = mount_table[k].options;
550
551
552                 if (mount(mount_table[k].what,
553                           where,
554                           mount_table[k].type,
555                           mount_table[k].flags,
556                           o) < 0 &&
557                     mount_table[k].fatal) {
558
559                         log_error("mount(%s) failed: %m", where);
560
561                         if (r == 0)
562                                 r = -errno;
563                 }
564         }
565
566         return r;
567 }
568
569 static int mount_binds(const char *dest, char **l, unsigned long flags) {
570         char **x, **y;
571
572         STRV_FOREACH_PAIR(x, y, l) {
573                 char *where;
574                 struct stat source_st, dest_st;
575                 int r;
576
577                 if (stat(*x, &source_st) < 0) {
578                         log_error("failed to stat %s: %m", *x);
579                         return -errno;
580                 }
581
582                 where = strappenda(dest, *y);
583                 r = stat(where, &dest_st);
584                 if (r == 0) {
585                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
586                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
587                                                 *x, where);
588                                 return -EINVAL;
589                         }
590                 } else if (errno == ENOENT) {
591                         r = mkdir_parents_label(where, 0755);
592                         if (r < 0) {
593                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
594                                 return r;
595                         }
596                 } else {
597                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
598                         return -errno;
599                 }
600                 /* Create the mount point, but be conservative -- refuse to create block
601                 * and char devices. */
602                 if (S_ISDIR(source_st.st_mode))
603                         mkdir_label(where, 0755);
604                 else if (S_ISFIFO(source_st.st_mode))
605                         mkfifo(where, 0644);
606                 else if (S_ISSOCK(source_st.st_mode))
607                         mknod(where, 0644 | S_IFSOCK, 0);
608                 else if (S_ISREG(source_st.st_mode))
609                         touch(where);
610                 else {
611                         log_error("Refusing to create mountpoint for file: %s", *x);
612                         return -ENOTSUP;
613                 }
614
615                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
616                         log_error("mount(%s) failed: %m", where);
617                         return -errno;
618                 }
619
620                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
621                         log_error("mount(%s) failed: %m", where);
622                         return -errno;
623                 }
624         }
625
626         return 0;
627 }
628
629 static int setup_timezone(const char *dest) {
630         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
631         char *z, *y;
632         int r;
633
634         assert(dest);
635
636         /* Fix the timezone, if possible */
637         r = readlink_malloc("/etc/localtime", &p);
638         if (r < 0) {
639                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
640                 return 0;
641         }
642
643         z = path_startswith(p, "../usr/share/zoneinfo/");
644         if (!z)
645                 z = path_startswith(p, "/usr/share/zoneinfo/");
646         if (!z) {
647                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
648                 return 0;
649         }
650
651         where = strappend(dest, "/etc/localtime");
652         if (!where)
653                 return log_oom();
654
655         r = readlink_malloc(where, &q);
656         if (r >= 0) {
657                 y = path_startswith(q, "../usr/share/zoneinfo/");
658                 if (!y)
659                         y = path_startswith(q, "/usr/share/zoneinfo/");
660
661
662                 /* Already pointing to the right place? Then do nothing .. */
663                 if (y && streq(y, z))
664                         return 0;
665         }
666
667         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
668         if (!check)
669                 return log_oom();
670
671         if (access(check, F_OK) < 0) {
672                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
673                 return 0;
674         }
675
676         what = strappend("../usr/share/zoneinfo/", z);
677         if (!what)
678                 return log_oom();
679
680         unlink(where);
681         if (symlink(what, where) < 0) {
682                 log_error("Failed to correct timezone of container: %m");
683                 return 0;
684         }
685
686         return 0;
687 }
688
689 static int setup_resolv_conf(const char *dest) {
690         char _cleanup_free_ *where = NULL;
691
692         assert(dest);
693
694         if (arg_private_network)
695                 return 0;
696
697         /* Fix resolv.conf, if possible */
698         where = strappend(dest, "/etc/resolv.conf");
699         if (!where)
700                 return log_oom();
701
702         /* We don't really care for the results of this really. If it
703          * fails, it fails, but meh... */
704         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
705
706         return 0;
707 }
708
709 static int setup_boot_id(const char *dest) {
710         _cleanup_free_ char *from = NULL, *to = NULL;
711         sd_id128_t rnd;
712         char as_uuid[37];
713         int r;
714
715         assert(dest);
716
717         if (arg_share_system)
718                 return 0;
719
720         /* Generate a new randomized boot ID, so that each boot-up of
721          * the container gets a new one */
722
723         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
724         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
725         if (!from || !to)
726                 return log_oom();
727
728         r = sd_id128_randomize(&rnd);
729         if (r < 0) {
730                 log_error("Failed to generate random boot id: %s", strerror(-r));
731                 return r;
732         }
733
734         snprintf(as_uuid, sizeof(as_uuid),
735                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
736                  SD_ID128_FORMAT_VAL(rnd));
737         char_array_0(as_uuid);
738
739         r = write_string_file(from, as_uuid);
740         if (r < 0) {
741                 log_error("Failed to write boot id: %s", strerror(-r));
742                 return r;
743         }
744
745         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
746                 log_error("Failed to bind mount boot id: %m");
747                 r = -errno;
748         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
749                 log_warning("Failed to make boot id read-only: %m");
750
751         unlink(from);
752         return r;
753 }
754
755 static int copy_devnodes(const char *dest) {
756
757         static const char devnodes[] =
758                 "null\0"
759                 "zero\0"
760                 "full\0"
761                 "random\0"
762                 "urandom\0"
763                 "tty\0";
764
765         const char *d;
766         int r = 0;
767         _cleanup_umask_ mode_t u;
768
769         assert(dest);
770
771         u = umask(0000);
772
773         NULSTR_FOREACH(d, devnodes) {
774                 _cleanup_free_ char *from = NULL, *to = NULL;
775                 struct stat st;
776
777                 from = strappend("/dev/", d);
778                 to = strjoin(dest, "/dev/", d, NULL);
779                 if (!from || !to)
780                         return log_oom();
781
782                 if (stat(from, &st) < 0) {
783
784                         if (errno != ENOENT) {
785                                 log_error("Failed to stat %s: %m", from);
786                                 return -errno;
787                         }
788
789                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
790
791                         log_error("%s is not a char or block device, cannot copy", from);
792                         return -EIO;
793
794                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
795
796                         log_error("mknod(%s) failed: %m", dest);
797                         return  -errno;
798                 }
799         }
800
801         return r;
802 }
803
804 static int setup_ptmx(const char *dest) {
805         _cleanup_free_ char *p = NULL;
806
807         p = strappend(dest, "/dev/ptmx");
808         if (!p)
809                 return log_oom();
810
811         if (symlink("pts/ptmx", p) < 0) {
812                 log_error("Failed to create /dev/ptmx symlink: %m");
813                 return -errno;
814         }
815
816         return 0;
817 }
818
819 static int setup_dev_console(const char *dest, const char *console) {
820         struct stat st;
821         _cleanup_free_ char *to = NULL;
822         int r;
823         _cleanup_umask_ mode_t u;
824
825         assert(dest);
826         assert(console);
827
828         u = umask(0000);
829
830         if (stat(console, &st) < 0) {
831                 log_error("Failed to stat %s: %m", console);
832                 return -errno;
833
834         } else if (!S_ISCHR(st.st_mode)) {
835                 log_error("/dev/console is not a char device");
836                 return -EIO;
837         }
838
839         r = chmod_and_chown(console, 0600, 0, 0);
840         if (r < 0) {
841                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
842                 return r;
843         }
844
845         if (asprintf(&to, "%s/dev/console", dest) < 0)
846                 return log_oom();
847
848         /* We need to bind mount the right tty to /dev/console since
849          * ptys can only exist on pts file systems. To have something
850          * to bind mount things on we create a device node first, that
851          * has the right major/minor (note that the major minor
852          * doesn't actually matter here, since we mount it over
853          * anyway). */
854
855         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
856                 log_error("mknod() for /dev/console failed: %m");
857                 return -errno;
858         }
859
860         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
861                 log_error("Bind mount for /dev/console failed: %m");
862                 return -errno;
863         }
864
865         return 0;
866 }
867
868 static int setup_kmsg(const char *dest, int kmsg_socket) {
869         _cleanup_free_ char *from = NULL, *to = NULL;
870         int r, fd, k;
871         _cleanup_umask_ mode_t u;
872         union {
873                 struct cmsghdr cmsghdr;
874                 uint8_t buf[CMSG_SPACE(sizeof(int))];
875         } control = {};
876         struct msghdr mh = {
877                 .msg_control = &control,
878                 .msg_controllen = sizeof(control),
879         };
880         struct cmsghdr *cmsg;
881
882         assert(dest);
883         assert(kmsg_socket >= 0);
884
885         u = umask(0000);
886
887         /* We create the kmsg FIFO as /dev/kmsg, but immediately
888          * delete it after bind mounting it to /proc/kmsg. While FIFOs
889          * on the reading side behave very similar to /proc/kmsg,
890          * their writing side behaves differently from /dev/kmsg in
891          * that writing blocks when nothing is reading. In order to
892          * avoid any problems with containers deadlocking due to this
893          * we simply make /dev/kmsg unavailable to the container. */
894         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
895             asprintf(&to, "%s/proc/kmsg", dest) < 0)
896                 return log_oom();
897
898         if (mkfifo(from, 0600) < 0) {
899                 log_error("mkfifo() for /dev/kmsg failed: %m");
900                 return -errno;
901         }
902
903         r = chmod_and_chown(from, 0600, 0, 0);
904         if (r < 0) {
905                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
906                 return r;
907         }
908
909         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
910                 log_error("Bind mount for /proc/kmsg failed: %m");
911                 return -errno;
912         }
913
914         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
915         if (fd < 0) {
916                 log_error("Failed to open fifo: %m");
917                 return -errno;
918         }
919
920         cmsg = CMSG_FIRSTHDR(&mh);
921         cmsg->cmsg_level = SOL_SOCKET;
922         cmsg->cmsg_type = SCM_RIGHTS;
923         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
924         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
925
926         mh.msg_controllen = cmsg->cmsg_len;
927
928         /* Store away the fd in the socket, so that it stays open as
929          * long as we run the child */
930         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
931         close_nointr_nofail(fd);
932
933         if (k < 0) {
934                 log_error("Failed to send FIFO fd: %m");
935                 return -errno;
936         }
937
938         /* And now make the FIFO unavailable as /dev/kmsg... */
939         unlink(from);
940         return 0;
941 }
942
943 static int setup_hostname(void) {
944
945         if (arg_share_system)
946                 return 0;
947
948         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
949                 return -errno;
950
951         return 0;
952 }
953
954 static int setup_journal(const char *directory) {
955         sd_id128_t machine_id, this_id;
956         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
957         char *id;
958         int r;
959
960         p = strappend(directory, "/etc/machine-id");
961         if (!p)
962                 return log_oom();
963
964         r = read_one_line_file(p, &b);
965         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
966                 return 0;
967         else if (r < 0) {
968                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
969                 return r;
970         }
971
972         id = strstrip(b);
973         if (isempty(id) && arg_link_journal == LINK_AUTO)
974                 return 0;
975
976         /* Verify validity */
977         r = sd_id128_from_string(id, &machine_id);
978         if (r < 0) {
979                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
980                 return r;
981         }
982
983         r = sd_id128_get_machine(&this_id);
984         if (r < 0) {
985                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
986                 return r;
987         }
988
989         if (sd_id128_equal(machine_id, this_id)) {
990                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
991                          "Host and machine ids are equal (%s): refusing to link journals", id);
992                 if (arg_link_journal == LINK_AUTO)
993                         return 0;
994                 return
995                         -EEXIST;
996         }
997
998         if (arg_link_journal == LINK_NO)
999                 return 0;
1000
1001         free(p);
1002         p = strappend("/var/log/journal/", id);
1003         q = strjoin(directory, "/var/log/journal/", id, NULL);
1004         if (!p || !q)
1005                 return log_oom();
1006
1007         if (path_is_mount_point(p, false) > 0) {
1008                 if (arg_link_journal != LINK_AUTO) {
1009                         log_error("%s: already a mount point, refusing to use for journal", p);
1010                         return -EEXIST;
1011                 }
1012
1013                 return 0;
1014         }
1015
1016         if (path_is_mount_point(q, false) > 0) {
1017                 if (arg_link_journal != LINK_AUTO) {
1018                         log_error("%s: already a mount point, refusing to use for journal", q);
1019                         return -EEXIST;
1020                 }
1021
1022                 return 0;
1023         }
1024
1025         r = readlink_and_make_absolute(p, &d);
1026         if (r >= 0) {
1027                 if ((arg_link_journal == LINK_GUEST ||
1028                      arg_link_journal == LINK_AUTO) &&
1029                     path_equal(d, q)) {
1030
1031                         r = mkdir_p(q, 0755);
1032                         if (r < 0)
1033                                 log_warning("failed to create directory %s: %m", q);
1034                         return 0;
1035                 }
1036
1037                 if (unlink(p) < 0) {
1038                         log_error("Failed to remove symlink %s: %m", p);
1039                         return -errno;
1040                 }
1041         } else if (r == -EINVAL) {
1042
1043                 if (arg_link_journal == LINK_GUEST &&
1044                     rmdir(p) < 0) {
1045
1046                         if (errno == ENOTDIR) {
1047                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1048                                 return r;
1049                         } else {
1050                                 log_error("Failed to remove %s: %m", p);
1051                                 return -errno;
1052                         }
1053                 }
1054         } else if (r != -ENOENT) {
1055                 log_error("readlink(%s) failed: %m", p);
1056                 return r;
1057         }
1058
1059         if (arg_link_journal == LINK_GUEST) {
1060
1061                 if (symlink(q, p) < 0) {
1062                         log_error("Failed to symlink %s to %s: %m", q, p);
1063                         return -errno;
1064                 }
1065
1066                 r = mkdir_p(q, 0755);
1067                 if (r < 0)
1068                         log_warning("failed to create directory %s: %m", q);
1069                 return 0;
1070         }
1071
1072         if (arg_link_journal == LINK_HOST) {
1073                 r = mkdir_p(p, 0755);
1074                 if (r < 0) {
1075                         log_error("Failed to create %s: %m", p);
1076                         return r;
1077                 }
1078
1079         } else if (access(p, F_OK) < 0)
1080                 return 0;
1081
1082         if (dir_is_empty(q) == 0) {
1083                 log_error("%s not empty.", q);
1084                 return -ENOTEMPTY;
1085         }
1086
1087         r = mkdir_p(q, 0755);
1088         if (r < 0) {
1089                 log_error("Failed to create %s: %m", q);
1090                 return r;
1091         }
1092
1093         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1094                 log_error("Failed to bind mount journal from host into guest: %m");
1095                 return -errno;
1096         }
1097
1098         return 0;
1099 }
1100
1101 static int setup_kdbus(const char *dest, const char *path) {
1102         const char *p;
1103
1104         if (!path)
1105                 return 0;
1106
1107         p = strappenda(dest, "/dev/kdbus");
1108         if (mkdir(p, 0755) < 0) {
1109                 log_error("Failed to create kdbus path: %m");
1110                 return  -errno;
1111         }
1112
1113         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1114                 log_error("Failed to mount kdbus domain path: %m");
1115                 return -errno;
1116         }
1117
1118         return 0;
1119 }
1120
1121 static int drop_capabilities(void) {
1122         return capability_bounding_set_drop(~arg_retain, false);
1123 }
1124
1125 static int register_machine(pid_t pid) {
1126         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1127         _cleanup_bus_unref_ sd_bus *bus = NULL;
1128         int r;
1129
1130         if (!arg_register)
1131                 return 0;
1132
1133         r = sd_bus_default_system(&bus);
1134         if (r < 0) {
1135                 log_error("Failed to open system bus: %s", strerror(-r));
1136                 return r;
1137         }
1138
1139         if (arg_keep_unit) {
1140                 r = sd_bus_call_method(
1141                                 bus,
1142                                 "org.freedesktop.machine1",
1143                                 "/org/freedesktop/machine1",
1144                                 "org.freedesktop.machine1.Manager",
1145                                 "RegisterMachine",
1146                                 &error,
1147                                 NULL,
1148                                 "sayssus",
1149                                 arg_machine,
1150                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1151                                 "nspawn",
1152                                 "container",
1153                                 (uint32_t) pid,
1154                                 strempty(arg_directory));
1155         } else {
1156                 r = sd_bus_call_method(
1157                                 bus,
1158                                 "org.freedesktop.machine1",
1159                                 "/org/freedesktop/machine1",
1160                                 "org.freedesktop.machine1.Manager",
1161                                 "CreateMachine",
1162                                 &error,
1163                                 NULL,
1164                                 "sayssusa(sv)",
1165                                 arg_machine,
1166                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1167                                 "nspawn",
1168                                 "container",
1169                                 (uint32_t) pid,
1170                                 strempty(arg_directory),
1171                                 !isempty(arg_slice), "Slice", "s", arg_slice);
1172         }
1173
1174         if (r < 0) {
1175                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1176                 return r;
1177         }
1178
1179         return 0;
1180 }
1181
1182 static int terminate_machine(pid_t pid) {
1183         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1184         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1185         _cleanup_bus_unref_ sd_bus *bus = NULL;
1186         const char *path;
1187         int r;
1188
1189         if (!arg_register)
1190                 return 0;
1191
1192         r = sd_bus_default_system(&bus);
1193         if (r < 0) {
1194                 log_error("Failed to open system bus: %s", strerror(-r));
1195                 return r;
1196         }
1197
1198         r = sd_bus_call_method(
1199                         bus,
1200                         "org.freedesktop.machine1",
1201                         "/org/freedesktop/machine1",
1202                         "org.freedesktop.machine1.Manager",
1203                         "GetMachineByPID",
1204                         &error,
1205                         &reply,
1206                         "u",
1207                         (uint32_t) pid);
1208         if (r < 0) {
1209                 /* Note that the machine might already have been
1210                  * cleaned up automatically, hence don't consider it a
1211                  * failure if we cannot get the machine object. */
1212                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1213                 return 0;
1214         }
1215
1216         r = sd_bus_message_read(reply, "o", &path);
1217         if (r < 0)
1218                 return bus_log_parse_error(r);
1219
1220         r = sd_bus_call_method(
1221                         bus,
1222                         "org.freedesktop.machine1",
1223                         path,
1224                         "org.freedesktop.machine1.Machine",
1225                         "Terminate",
1226                         &error,
1227                         NULL,
1228                         NULL);
1229         if (r < 0) {
1230                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1231                 return 0;
1232         }
1233
1234         return 0;
1235 }
1236
1237 static int reset_audit_loginuid(void) {
1238         _cleanup_free_ char *p = NULL;
1239         int r;
1240
1241         if (arg_share_system)
1242                 return 0;
1243
1244         r = read_one_line_file("/proc/self/loginuid", &p);
1245         if (r == -EEXIST)
1246                 return 0;
1247         if (r < 0) {
1248                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1249                 return r;
1250         }
1251
1252         /* Already reset? */
1253         if (streq(p, "4294967295"))
1254                 return 0;
1255
1256         r = write_string_file("/proc/self/loginuid", "4294967295");
1257         if (r < 0) {
1258                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1259                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1260                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1261                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1262                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1263
1264                 sleep(5);
1265         }
1266
1267         return 0;
1268 }
1269
1270 static int setup_veth(int netns_fd) {
1271         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1272         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1273         char iface_name[IFNAMSIZ] = "ve-";
1274         int r;
1275
1276         if (!arg_private_network)
1277                 return 0;
1278
1279         if (!arg_network_veth)
1280                 return 0;
1281
1282         strncpy(iface_name+3, arg_machine, sizeof(iface_name) - 3);
1283
1284         r = sd_rtnl_open(0, &rtnl);
1285         if (r < 0) {
1286                 log_error("Failed to connect to netlink: %s", strerror(-r));
1287                 return r;
1288         }
1289
1290         r = sd_rtnl_message_new_link(RTM_NEWLINK, 0, &m);
1291         if (r < 0) {
1292                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1293                 return r;
1294         }
1295
1296         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1297         if (r < 0) {
1298                 log_error("Failed to append netlink kind: %s", strerror(-r));
1299                 return r;
1300         }
1301
1302         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO, 0);
1303         if (r < 0) {
1304                 log_error("Failed to open netlink container: %s", strerror(-r));
1305                 return r;
1306         }
1307
1308         r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1309         if (r < 0) {
1310                 log_error("Failed to append netlink kind: %s", strerror(-r));
1311                 return r;
1312         }
1313
1314         r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA, 0);
1315         if (r < 0) {
1316                 log_error("Failed to open netlink container: %s", strerror(-r));
1317                 return r;
1318         }
1319
1320         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER, sizeof(struct ifinfomsg));
1321         if (r < 0) {
1322                 log_error("z Failed to open netlink container: %s", strerror(-r));
1323                 return r;
1324         }
1325
1326         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1327         if (r < 0) {
1328                 log_error("Failed to append netlink kind: %s", strerror(-r));
1329                 return r;
1330         }
1331
1332         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_FD, netns_fd);
1333         if (r < 0) {
1334                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1335                 return r;
1336         }
1337
1338         r = sd_rtnl_message_close_container(m);
1339         if (r < 0) {
1340                 log_error("Failed to close netlink container: %s", strerror(-r));
1341                 return r;
1342         }
1343
1344         r = sd_rtnl_message_close_container(m);
1345         if (r < 0) {
1346                 log_error("Failed to close netlink container: %s", strerror(-r));
1347                 return r;
1348         }
1349
1350         r = sd_rtnl_message_close_container(m);
1351         if (r < 0) {
1352                 log_error("Failed to close netlink container: %s", strerror(-r));
1353                 return r;
1354         }
1355
1356         r = sd_rtnl_call(rtnl, m, 0, NULL);
1357         if (r < 0) {
1358                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1359                 return r;
1360         }
1361
1362         return 0;
1363 }
1364
1365 static int move_network_interfaces(pid_t pid) {
1366         _cleanup_udev_unref_ struct udev *udev = NULL;
1367         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1368         char **i;
1369         int r;
1370
1371         if (!arg_private_network)
1372                 return 0;
1373
1374         if (strv_isempty(arg_network_interfaces))
1375                 return 0;
1376
1377         r = sd_rtnl_open(0, &rtnl);
1378         if (r < 0) {
1379                 log_error("Failed to connect to netlink: %s", strerror(-r));
1380                 return r;
1381         }
1382
1383         udev = udev_new();
1384         if (!udev) {
1385                 log_error("Failed to connect to udev.");
1386                 return -ENOMEM;
1387         }
1388
1389         STRV_FOREACH(i, arg_network_interfaces) {
1390                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1391                 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1392                 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1393                 int ifi;
1394
1395                 ifi = (int) if_nametoindex(*i);
1396                 if (ifi <= 0) {
1397                         log_error("Failed to resolve interface %s: %m", *i);
1398                         return -errno;
1399                 }
1400
1401                 sprintf(ifi_str, "n%i", ifi);
1402                 d = udev_device_new_from_device_id(udev, ifi_str);
1403                 if (!d) {
1404                         log_error("Failed to get udev device for interface %s: %m", *i);
1405                         return -errno;
1406                 }
1407
1408                 if (udev_device_get_is_initialized(d) <= 0) {
1409                         log_error("Network interface %s is not initialized yet.", *i);
1410                         return -EBUSY;
1411                 }
1412
1413                 r = sd_rtnl_message_new_link(RTM_NEWLINK, ifi, &m);
1414                 if (r < 0) {
1415                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1416                         return r;
1417                 }
1418
1419                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1420                 if (r < 0) {
1421                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1422                         return r;
1423                 }
1424
1425                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1426                 if (r < 0) {
1427                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1428                         return r;
1429                 }
1430         }
1431
1432         return 0;
1433 }
1434
1435 int main(int argc, char *argv[]) {
1436
1437         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1, netns_fd = -1;
1438         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1439         _cleanup_free_ char *kdbus_domain = NULL;
1440         _cleanup_fdset_free_ FDSet *fds = NULL;
1441         const char *console = NULL;
1442         int r = EXIT_FAILURE, k;
1443         int n_fd_passed;
1444         pid_t pid = 0;
1445         sigset_t mask;
1446
1447         log_parse_environment();
1448         log_open();
1449
1450         k = parse_argv(argc, argv);
1451         if (k < 0)
1452                 goto finish;
1453         else if (k == 0) {
1454                 r = EXIT_SUCCESS;
1455                 goto finish;
1456         }
1457
1458         if (arg_directory) {
1459                 char *p;
1460
1461                 p = path_make_absolute_cwd(arg_directory);
1462                 free(arg_directory);
1463                 arg_directory = p;
1464         } else
1465                 arg_directory = get_current_dir_name();
1466
1467         if (!arg_directory) {
1468                 log_error("Failed to determine path, please use -D.");
1469                 goto finish;
1470         }
1471
1472         path_kill_slashes(arg_directory);
1473
1474         if (!arg_machine) {
1475                 arg_machine = strdup(basename(arg_directory));
1476                 if (!arg_machine) {
1477                         log_oom();
1478                         goto finish;
1479                 }
1480
1481                 hostname_cleanup(arg_machine, false);
1482                 if (isempty(arg_machine)) {
1483                         log_error("Failed to determine machine name automatically, please use -M.");
1484                         goto finish;
1485                 }
1486         }
1487
1488         if (geteuid() != 0) {
1489                 log_error("Need to be root.");
1490                 goto finish;
1491         }
1492
1493         if (sd_booted() <= 0) {
1494                 log_error("Not running on a systemd system.");
1495                 goto finish;
1496         }
1497
1498         if (path_equal(arg_directory, "/")) {
1499                 log_error("Spawning container on root directory not supported.");
1500                 goto finish;
1501         }
1502
1503         if (arg_boot && path_is_os_tree(arg_directory) <= 0) {
1504                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1505                 goto finish;
1506         }
1507
1508         log_close();
1509         n_fd_passed = sd_listen_fds(false);
1510         if (n_fd_passed > 0) {
1511                 k = fdset_new_listen_fds(&fds, false);
1512                 if (k < 0) {
1513                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1514                         goto finish;
1515                 }
1516         }
1517         fdset_close_others(fds);
1518         log_open();
1519
1520         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1521         if (master < 0) {
1522                 log_error("Failed to acquire pseudo tty: %m");
1523                 goto finish;
1524         }
1525
1526         console = ptsname(master);
1527         if (!console) {
1528                 log_error("Failed to determine tty name: %m");
1529                 goto finish;
1530         }
1531
1532         if (!arg_quiet)
1533                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1534
1535         if (unlockpt(master) < 0) {
1536                 log_error("Failed to unlock tty: %m");
1537                 goto finish;
1538         }
1539
1540         if (arg_network_veth) {
1541                 netns_fd = open("/proc/self/ns/net", O_RDWR|O_CLOEXEC);
1542                 if (netns_fd < 0) {
1543                         log_error("Failed to open network namespace fd: %m");
1544                         goto finish;
1545                 }
1546         }
1547
1548         if (access("/dev/kdbus/control", F_OK) >= 0) {
1549
1550                 if (arg_share_system) {
1551                         kdbus_domain = strdup("/dev/kdbus");
1552                         if (!kdbus_domain) {
1553                                 log_oom();
1554                                 goto finish;
1555                         }
1556                 } else {
1557                         const char *ns;
1558
1559                         ns = strappenda("machine-", arg_machine);
1560                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1561                         if (r < 0)
1562                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1563                         else
1564                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1565                 }
1566         }
1567
1568         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1569                 log_error("Failed to create kmsg socket pair: %m");
1570                 goto finish;
1571         }
1572
1573         sd_notify(0, "READY=1");
1574
1575         assert_se(sigemptyset(&mask) == 0);
1576         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1577         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1578
1579         for (;;) {
1580                 siginfo_t status;
1581
1582                 sync_fd = eventfd(0, EFD_CLOEXEC);
1583                 if (sync_fd < 0) {
1584                         log_error("Failed to create event fd: %m");
1585                         goto finish;
1586                 }
1587
1588                 pid = syscall(__NR_clone,
1589                               SIGCHLD|CLONE_NEWNS|
1590                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1591                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
1592                 if (pid < 0) {
1593                         if (errno == EINVAL)
1594                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1595                         else
1596                                 log_error("clone() failed: %m");
1597
1598                         goto finish;
1599                 }
1600
1601                 if (pid == 0) {
1602                         /* child */
1603                         const char *home = NULL;
1604                         uid_t uid = (uid_t) -1;
1605                         gid_t gid = (gid_t) -1;
1606                         unsigned n_env = 2;
1607                         const char *envp[] = {
1608                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1609                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1610                                 NULL, /* TERM */
1611                                 NULL, /* HOME */
1612                                 NULL, /* USER */
1613                                 NULL, /* LOGNAME */
1614                                 NULL, /* container_uuid */
1615                                 NULL, /* LISTEN_FDS */
1616                                 NULL, /* LISTEN_PID */
1617                                 NULL
1618                         };
1619                         char **env_use;
1620                         eventfd_t x;
1621
1622                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1623                         if (envp[n_env])
1624                                 n_env ++;
1625
1626                         close_nointr_nofail(master);
1627                         master = -1;
1628
1629                         close_nointr(STDIN_FILENO);
1630                         close_nointr(STDOUT_FILENO);
1631                         close_nointr(STDERR_FILENO);
1632
1633                         close_nointr_nofail(kmsg_socket_pair[0]);
1634                         kmsg_socket_pair[0] = -1;
1635
1636                         reset_all_signal_handlers();
1637
1638                         assert_se(sigemptyset(&mask) == 0);
1639                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1640
1641                         k = open_terminal(console, O_RDWR);
1642                         if (k != STDIN_FILENO) {
1643                                 if (k >= 0) {
1644                                         close_nointr_nofail(k);
1645                                         k = -EINVAL;
1646                                 }
1647
1648                                 log_error("Failed to open console: %s", strerror(-k));
1649                                 goto child_fail;
1650                         }
1651
1652                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1653                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1654                                 log_error("Failed to duplicate console: %m");
1655                                 goto child_fail;
1656                         }
1657
1658                         if (setsid() < 0) {
1659                                 log_error("setsid() failed: %m");
1660                                 goto child_fail;
1661                         }
1662
1663                         if (reset_audit_loginuid() < 0)
1664                                 goto child_fail;
1665
1666                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1667                                 log_error("PR_SET_PDEATHSIG failed: %m");
1668                                 goto child_fail;
1669                         }
1670
1671                         /* Mark everything as slave, so that we still
1672                          * receive mounts from the real root, but don't
1673                          * propagate mounts to the real root. */
1674                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1675                                 log_error("MS_SLAVE|MS_REC failed: %m");
1676                                 goto child_fail;
1677                         }
1678
1679                         /* Turn directory into bind mount */
1680                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1681                                 log_error("Failed to make bind mount.");
1682                                 goto child_fail;
1683                         }
1684
1685                         if (arg_read_only)
1686                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1687                                         log_error("Failed to make read-only.");
1688                                         goto child_fail;
1689                                 }
1690
1691                         if (mount_all(arg_directory) < 0)
1692                                 goto child_fail;
1693
1694                         if (copy_devnodes(arg_directory) < 0)
1695                                 goto child_fail;
1696
1697                         if (setup_ptmx(arg_directory) < 0)
1698                                 goto child_fail;
1699
1700                         dev_setup(arg_directory);
1701
1702                         if (setup_veth(netns_fd) < 0)
1703                                 goto child_fail;
1704
1705                         if (netns_fd >= 0) {
1706                                 close_nointr_nofail(netns_fd);
1707                                 netns_fd = -1;
1708                         }
1709
1710                         if (setup_dev_console(arg_directory, console) < 0)
1711                                 goto child_fail;
1712
1713                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1714                                 goto child_fail;
1715
1716                         close_nointr_nofail(kmsg_socket_pair[1]);
1717                         kmsg_socket_pair[1] = -1;
1718
1719                         if (setup_boot_id(arg_directory) < 0)
1720                                 goto child_fail;
1721
1722                         if (setup_timezone(arg_directory) < 0)
1723                                 goto child_fail;
1724
1725                         if (setup_resolv_conf(arg_directory) < 0)
1726                                 goto child_fail;
1727
1728                         if (setup_journal(arg_directory) < 0)
1729                                 goto child_fail;
1730
1731                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1732                                 goto child_fail;
1733
1734                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1735                                 goto child_fail;
1736
1737                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1738                                 goto child_fail;
1739
1740                         if (chdir(arg_directory) < 0) {
1741                                 log_error("chdir(%s) failed: %m", arg_directory);
1742                                 goto child_fail;
1743                         }
1744
1745                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1746                                 log_error("mount(MS_MOVE) failed: %m");
1747                                 goto child_fail;
1748                         }
1749
1750                         if (chroot(".") < 0) {
1751                                 log_error("chroot() failed: %m");
1752                                 goto child_fail;
1753                         }
1754
1755                         if (chdir("/") < 0) {
1756                                 log_error("chdir() failed: %m");
1757                                 goto child_fail;
1758                         }
1759
1760                         umask(0022);
1761
1762                         if (arg_private_network)
1763                                 loopback_setup();
1764
1765                         if (drop_capabilities() < 0) {
1766                                 log_error("drop_capabilities() failed: %m");
1767                                 goto child_fail;
1768                         }
1769
1770                         if (arg_user) {
1771
1772                                 /* Note that this resolves user names
1773                                  * inside the container, and hence
1774                                  * accesses the NSS modules from the
1775                                  * container and not the host. This is
1776                                  * a bit weird... */
1777
1778                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1779                                         log_error("get_user_creds() failed: %m");
1780                                         goto child_fail;
1781                                 }
1782
1783                                 if (mkdir_parents_label(home, 0775) < 0) {
1784                                         log_error("mkdir_parents_label() failed: %m");
1785                                         goto child_fail;
1786                                 }
1787
1788                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1789                                         log_error("mkdir_safe_label() failed: %m");
1790                                         goto child_fail;
1791                                 }
1792
1793                                 if (initgroups((const char*)arg_user, gid) < 0) {
1794                                         log_error("initgroups() failed: %m");
1795                                         goto child_fail;
1796                                 }
1797
1798                                 if (setresgid(gid, gid, gid) < 0) {
1799                                         log_error("setregid() failed: %m");
1800                                         goto child_fail;
1801                                 }
1802
1803                                 if (setresuid(uid, uid, uid) < 0) {
1804                                         log_error("setreuid() failed: %m");
1805                                         goto child_fail;
1806                                 }
1807                         } else {
1808                                 /* Reset everything fully to 0, just in case */
1809
1810                                 if (setgroups(0, NULL) < 0) {
1811                                         log_error("setgroups() failed: %m");
1812                                         goto child_fail;
1813                                 }
1814
1815                                 if (setresgid(0, 0, 0) < 0) {
1816                                         log_error("setregid() failed: %m");
1817                                         goto child_fail;
1818                                 }
1819
1820                                 if (setresuid(0, 0, 0) < 0) {
1821                                         log_error("setreuid() failed: %m");
1822                                         goto child_fail;
1823                                 }
1824                         }
1825
1826                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1827                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1828                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1829                                 log_oom();
1830                                 goto child_fail;
1831                         }
1832
1833                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1834                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1835                                         log_oom();
1836                                         goto child_fail;
1837                                 }
1838                         }
1839
1840                         if (fdset_size(fds) > 0) {
1841                                 k = fdset_cloexec(fds, false);
1842                                 if (k < 0) {
1843                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1844                                         goto child_fail;
1845                                 }
1846
1847                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1848                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1849                                         log_oom();
1850                                         goto child_fail;
1851                                 }
1852                         }
1853
1854                         setup_hostname();
1855
1856                         eventfd_read(sync_fd, &x);
1857                         close_nointr_nofail(sync_fd);
1858                         sync_fd = -1;
1859
1860                         if (!strv_isempty(arg_setenv)) {
1861                                 char **n;
1862
1863                                 n = strv_env_merge(2, envp, arg_setenv);
1864                                 if (!n) {
1865                                         log_oom();
1866                                         goto child_fail;
1867                                 }
1868
1869                                 env_use = n;
1870                         } else
1871                                 env_use = (char**) envp;
1872
1873 #ifdef HAVE_SELINUX
1874                         if (arg_selinux_context)
1875                                 if (setexeccon(arg_selinux_context) < 0)
1876                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
1877 #endif
1878                         if (arg_boot) {
1879                                 char **a;
1880                                 size_t l;
1881
1882                                 /* Automatically search for the init system */
1883
1884                                 l = 1 + argc - optind;
1885                                 a = newa(char*, l + 1);
1886                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1887
1888                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1889                                 execve(a[0], a, env_use);
1890
1891                                 a[0] = (char*) "/lib/systemd/systemd";
1892                                 execve(a[0], a, env_use);
1893
1894                                 a[0] = (char*) "/sbin/init";
1895                                 execve(a[0], a, env_use);
1896                         } else if (argc > optind)
1897                                 execvpe(argv[optind], argv + optind, env_use);
1898                         else {
1899                                 chdir(home ? home : "/root");
1900                                 execle("/bin/bash", "-bash", NULL, env_use);
1901                         }
1902
1903                         log_error("execv() failed: %m");
1904
1905                 child_fail:
1906                         _exit(EXIT_FAILURE);
1907                 }
1908
1909                 fdset_free(fds);
1910                 fds = NULL;
1911
1912                 r = register_machine(pid);
1913                 if (r < 0)
1914                         goto finish;
1915
1916                 r = move_network_interfaces(pid);
1917                 if (r < 0)
1918                         goto finish;
1919
1920                 eventfd_write(sync_fd, 1);
1921                 close_nointr_nofail(sync_fd);
1922                 sync_fd = -1;
1923
1924                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1925                 if (k < 0) {
1926                         r = EXIT_FAILURE;
1927                         break;
1928                 }
1929
1930                 if (!arg_quiet)
1931                         putc('\n', stdout);
1932
1933                 /* Kill if it is not dead yet anyway */
1934                 terminate_machine(pid);
1935
1936                 /* Redundant, but better safe than sorry */
1937                 kill(pid, SIGKILL);
1938
1939                 k = wait_for_terminate(pid, &status);
1940                 pid = 0;
1941
1942                 if (k < 0) {
1943                         r = EXIT_FAILURE;
1944                         break;
1945                 }
1946
1947                 if (status.si_code == CLD_EXITED) {
1948                         r = status.si_status;
1949                         if (status.si_status != 0) {
1950                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1951                                 break;
1952                         }
1953
1954                         if (!arg_quiet)
1955                                 log_debug("Container %s exited successfully.", arg_machine);
1956                         break;
1957                 } else if (status.si_code == CLD_KILLED &&
1958                            status.si_status == SIGINT) {
1959
1960                         if (!arg_quiet)
1961                                 log_info("Container %s has been shut down.", arg_machine);
1962                         r = 0;
1963                         break;
1964                 } else if (status.si_code == CLD_KILLED &&
1965                            status.si_status == SIGHUP) {
1966
1967                         if (!arg_quiet)
1968                                 log_info("Container %s is being rebooted.", arg_machine);
1969                         continue;
1970                 } else if (status.si_code == CLD_KILLED ||
1971                            status.si_code == CLD_DUMPED) {
1972
1973                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
1974                         r = EXIT_FAILURE;
1975                         break;
1976                 } else {
1977                         log_error("Container %s failed due to unknown reason.", arg_machine);
1978                         r = EXIT_FAILURE;
1979                         break;
1980                 }
1981         }
1982
1983 finish:
1984         if (pid > 0)
1985                 kill(pid, SIGKILL);
1986
1987         free(arg_directory);
1988         free(arg_machine);
1989         free(arg_setenv);
1990         free(arg_network_interfaces);
1991
1992         return r;
1993 }