chiark / gitweb /
nspawn: make use of the devices cgroup controller by default
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #include "sd-daemon.h"
57 #include "sd-bus.h"
58 #include "sd-id128.h"
59 #include "sd-rtnl.h"
60 #include "log.h"
61 #include "util.h"
62 #include "mkdir.h"
63 #include "macro.h"
64 #include "audit.h"
65 #include "missing.h"
66 #include "cgroup-util.h"
67 #include "strv.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
71 #include "fdset.h"
72 #include "build.h"
73 #include "fileio.h"
74 #include "bus-util.h"
75 #include "bus-error.h"
76 #include "ptyfwd.h"
77 #include "bus-kernel.h"
78 #include "env-util.h"
79 #include "def.h"
80 #include "rtnl-util.h"
81 #include "udev-util.h"
82
83 #ifdef HAVE_SECCOMP
84 #include "seccomp-util.h"
85 #endif
86
/* Modes accepted by --link-journal= (see parse_argv()). The numeric
 * values are spelled out so they stay stable if entries are reordered. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto; the built-in default */
        LINK_HOST  = 2,  /* --link-journal=host */
        LINK_GUEST = 3   /* --link-journal=guest; also selected by -j */
} LinkJournal;
93
94 static char *arg_directory = NULL;
95 static char *arg_user = NULL;
96 static sd_id128_t arg_uuid = {};
97 static char *arg_machine = NULL;
98 static char *arg_selinux_context = NULL;
99 static char *arg_selinux_apifs_context = NULL;
100 static const char *arg_slice = NULL;
101 static bool arg_private_network = false;
102 static bool arg_read_only = false;
103 static bool arg_boot = false;
104 static LinkJournal arg_link_journal = LINK_AUTO;
105 static uint64_t arg_retain =
106         (1ULL << CAP_CHOWN) |
107         (1ULL << CAP_DAC_OVERRIDE) |
108         (1ULL << CAP_DAC_READ_SEARCH) |
109         (1ULL << CAP_FOWNER) |
110         (1ULL << CAP_FSETID) |
111         (1ULL << CAP_IPC_OWNER) |
112         (1ULL << CAP_KILL) |
113         (1ULL << CAP_LEASE) |
114         (1ULL << CAP_LINUX_IMMUTABLE) |
115         (1ULL << CAP_NET_BIND_SERVICE) |
116         (1ULL << CAP_NET_BROADCAST) |
117         (1ULL << CAP_NET_RAW) |
118         (1ULL << CAP_SETGID) |
119         (1ULL << CAP_SETFCAP) |
120         (1ULL << CAP_SETPCAP) |
121         (1ULL << CAP_SETUID) |
122         (1ULL << CAP_SYS_ADMIN) |
123         (1ULL << CAP_SYS_CHROOT) |
124         (1ULL << CAP_SYS_NICE) |
125         (1ULL << CAP_SYS_PTRACE) |
126         (1ULL << CAP_SYS_TTY_CONFIG) |
127         (1ULL << CAP_SYS_RESOURCE) |
128         (1ULL << CAP_SYS_BOOT) |
129         (1ULL << CAP_AUDIT_WRITE) |
130         (1ULL << CAP_AUDIT_CONTROL) |
131         (1ULL << CAP_MKNOD);
132 static char **arg_bind = NULL;
133 static char **arg_bind_ro = NULL;
134 static char **arg_setenv = NULL;
135 static bool arg_quiet = false;
136 static bool arg_share_system = false;
137 static bool arg_register = true;
138 static bool arg_keep_unit = false;
139 static char **arg_network_interfaces = NULL;
140 static bool arg_network_veth = false;
141 static char *arg_network_bridge = NULL;
142 static unsigned long arg_personality = 0xffffffffLU;
143
/* Prints the usage text to stdout, using the invoked program name.
 *
 * The text is kept in sync with the option table in parse_argv():
 * -j is documented as what it really selects (--link-journal=guest),
 * and --personality= is listed.
 *
 * Always returns 0. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=NAME       Root directory for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --personality=ARCH     Pick personality for this container\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n",
               program_invocation_short_name);

        return 0;
}
191
192 static int parse_argv(int argc, char *argv[]) {
193
194         enum {
195                 ARG_VERSION = 0x100,
196                 ARG_PRIVATE_NETWORK,
197                 ARG_UUID,
198                 ARG_READ_ONLY,
199                 ARG_CAPABILITY,
200                 ARG_DROP_CAPABILITY,
201                 ARG_LINK_JOURNAL,
202                 ARG_BIND,
203                 ARG_BIND_RO,
204                 ARG_SETENV,
205                 ARG_SHARE_SYSTEM,
206                 ARG_REGISTER,
207                 ARG_KEEP_UNIT,
208                 ARG_NETWORK_INTERFACE,
209                 ARG_NETWORK_VETH,
210                 ARG_NETWORK_BRIDGE,
211                 ARG_PERSONALITY,
212         };
213
214         static const struct option options[] = {
215                 { "help",                  no_argument,       NULL, 'h'                   },
216                 { "version",               no_argument,       NULL, ARG_VERSION           },
217                 { "directory",             required_argument, NULL, 'D'                   },
218                 { "user",                  required_argument, NULL, 'u'                   },
219                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
220                 { "boot",                  no_argument,       NULL, 'b'                   },
221                 { "uuid",                  required_argument, NULL, ARG_UUID              },
222                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
223                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
224                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
225                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
226                 { "bind",                  required_argument, NULL, ARG_BIND              },
227                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
228                 { "machine",               required_argument, NULL, 'M'                   },
229                 { "slice",                 required_argument, NULL, 'S'                   },
230                 { "setenv",                required_argument, NULL, ARG_SETENV            },
231                 { "selinux-context",       required_argument, NULL, 'Z'                   },
232                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
233                 { "quiet",                 no_argument,       NULL, 'q'                   },
234                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
235                 { "register",              required_argument, NULL, ARG_REGISTER          },
236                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
237                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
238                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
239                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
240                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
241                 {}
242         };
243
244         int c, r;
245         uint64_t plus = 0, minus = 0;
246
247         assert(argc >= 0);
248         assert(argv);
249
250         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
251
252                 switch (c) {
253
254                 case 'h':
255                         return help();
256
257                 case ARG_VERSION:
258                         puts(PACKAGE_STRING);
259                         puts(SYSTEMD_FEATURES);
260                         return 0;
261
262                 case 'D':
263                         free(arg_directory);
264                         arg_directory = canonicalize_file_name(optarg);
265                         if (!arg_directory) {
266                                 log_error("Invalid root directory: %m");
267                                 return -ENOMEM;
268                         }
269
270                         break;
271
272                 case 'u':
273                         free(arg_user);
274                         arg_user = strdup(optarg);
275                         if (!arg_user)
276                                 return log_oom();
277
278                         break;
279
280                 case ARG_NETWORK_BRIDGE:
281                         arg_network_bridge = strdup(optarg);
282                         if (!arg_network_bridge)
283                                 return log_oom();
284
285                         /* fall through */
286
287                 case ARG_NETWORK_VETH:
288                         arg_network_veth = true;
289                         arg_private_network = true;
290                         break;
291
292                 case ARG_NETWORK_INTERFACE:
293                         if (strv_push(&arg_network_interfaces, optarg) < 0)
294                                 return log_oom();
295
296                         /* fall through */
297
298                 case ARG_PRIVATE_NETWORK:
299                         arg_private_network = true;
300                         break;
301
302                 case 'b':
303                         arg_boot = true;
304                         break;
305
306                 case ARG_UUID:
307                         r = sd_id128_from_string(optarg, &arg_uuid);
308                         if (r < 0) {
309                                 log_error("Invalid UUID: %s", optarg);
310                                 return r;
311                         }
312                         break;
313
314                 case 'S':
315                         arg_slice = strdup(optarg);
316                         if (!arg_slice)
317                                 return log_oom();
318
319                         break;
320
321                 case 'M':
322                         if (isempty(optarg)) {
323                                 free(arg_machine);
324                                 arg_machine = NULL;
325                         } else {
326
327                                 if (!hostname_is_valid(optarg)) {
328                                         log_error("Invalid machine name: %s", optarg);
329                                         return -EINVAL;
330                                 }
331
332                                 free(arg_machine);
333                                 arg_machine = strdup(optarg);
334                                 if (!arg_machine)
335                                         return log_oom();
336
337                                 break;
338                         }
339
340                 case 'Z':
341                         arg_selinux_context = optarg;
342                         break;
343
344                 case 'L':
345                         arg_selinux_apifs_context = optarg;
346                         break;
347
348                 case ARG_READ_ONLY:
349                         arg_read_only = true;
350                         break;
351
352                 case ARG_CAPABILITY:
353                 case ARG_DROP_CAPABILITY: {
354                         char *state, *word;
355                         size_t length;
356
357                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
358                                 _cleanup_free_ char *t;
359                                 cap_value_t cap;
360
361                                 t = strndup(word, length);
362                                 if (!t)
363                                         return log_oom();
364
365                                 if (streq(t, "all")) {
366                                         if (c == ARG_CAPABILITY)
367                                                 plus = (uint64_t) -1;
368                                         else
369                                                 minus = (uint64_t) -1;
370                                 } else {
371                                         if (cap_from_name(t, &cap) < 0) {
372                                                 log_error("Failed to parse capability %s.", t);
373                                                 return -EINVAL;
374                                         }
375
376                                         if (c == ARG_CAPABILITY)
377                                                 plus |= 1ULL << (uint64_t) cap;
378                                         else
379                                                 minus |= 1ULL << (uint64_t) cap;
380                                 }
381                         }
382
383                         break;
384                 }
385
386                 case 'j':
387                         arg_link_journal = LINK_GUEST;
388                         break;
389
390                 case ARG_LINK_JOURNAL:
391                         if (streq(optarg, "auto"))
392                                 arg_link_journal = LINK_AUTO;
393                         else if (streq(optarg, "no"))
394                                 arg_link_journal = LINK_NO;
395                         else if (streq(optarg, "guest"))
396                                 arg_link_journal = LINK_GUEST;
397                         else if (streq(optarg, "host"))
398                                 arg_link_journal = LINK_HOST;
399                         else {
400                                 log_error("Failed to parse link journal mode %s", optarg);
401                                 return -EINVAL;
402                         }
403
404                         break;
405
406                 case ARG_BIND:
407                 case ARG_BIND_RO: {
408                         _cleanup_free_ char *a = NULL, *b = NULL;
409                         char *e;
410                         char ***x;
411
412                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
413
414                         e = strchr(optarg, ':');
415                         if (e) {
416                                 a = strndup(optarg, e - optarg);
417                                 b = strdup(e + 1);
418                         } else {
419                                 a = strdup(optarg);
420                                 b = strdup(optarg);
421                         }
422
423                         if (!a || !b)
424                                 return log_oom();
425
426                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
427                                 log_error("Invalid bind mount specification: %s", optarg);
428                                 return -EINVAL;
429                         }
430
431                         r = strv_extend(x, a);
432                         if (r < 0)
433                                 return log_oom();
434
435                         r = strv_extend(x, b);
436                         if (r < 0)
437                                 return log_oom();
438
439                         break;
440                 }
441
442                 case ARG_SETENV: {
443                         char **n;
444
445                         if (!env_assignment_is_valid(optarg)) {
446                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
447                                 return -EINVAL;
448                         }
449
450                         n = strv_env_set(arg_setenv, optarg);
451                         if (!n)
452                                 return log_oom();
453
454                         strv_free(arg_setenv);
455                         arg_setenv = n;
456                         break;
457                 }
458
459                 case 'q':
460                         arg_quiet = true;
461                         break;
462
463                 case ARG_SHARE_SYSTEM:
464                         arg_share_system = true;
465                         break;
466
467                 case ARG_REGISTER:
468                         r = parse_boolean(optarg);
469                         if (r < 0) {
470                                 log_error("Failed to parse --register= argument: %s", optarg);
471                                 return r;
472                         }
473
474                         arg_register = r;
475                         break;
476
477                 case ARG_KEEP_UNIT:
478                         arg_keep_unit = true;
479                         break;
480
481                 case ARG_PERSONALITY:
482
483                         arg_personality = personality_from_string(optarg);
484                         if (arg_personality == 0xffffffffLU) {
485                                 log_error("Unknown or unsupported personality '%s'.", optarg);
486                                 return -EINVAL;
487                         }
488
489                         break;
490
491                 case '?':
492                         return -EINVAL;
493
494                 default:
495                         assert_not_reached("Unhandled option");
496                 }
497         }
498
499         if (arg_share_system)
500                 arg_register = false;
501
502         if (arg_boot && arg_share_system) {
503                 log_error("--boot and --share-system may not be combined.");
504                 return -EINVAL;
505         }
506
507         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
508                 log_error("--keep-unit may not be used when invoked from a user session.");
509                 return -EINVAL;
510         }
511
512         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
513
514         return 1;
515 }
516
517 static int mount_all(const char *dest) {
518
519         typedef struct MountPoint {
520                 const char *what;
521                 const char *where;
522                 const char *type;
523                 const char *options;
524                 unsigned long flags;
525                 bool fatal;
526         } MountPoint;
527
528         static const MountPoint mount_table[] = {
529                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
530                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
531                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
532                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
533                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
534                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
535                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
536                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
537 #ifdef HAVE_SELINUX
538                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
539                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
540 #endif
541         };
542
543         unsigned k;
544         int r = 0;
545
546         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
547                 _cleanup_free_ char *where = NULL;
548 #ifdef HAVE_SELINUX
549                 _cleanup_free_ char *options = NULL;
550 #endif
551                 const char *o;
552                 int t;
553
554                 where = strjoin(dest, "/", mount_table[k].where, NULL);
555                 if (!where)
556                         return log_oom();
557
558                 t = path_is_mount_point(where, true);
559                 if (t < 0) {
560                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
561
562                         if (r == 0)
563                                 r = t;
564
565                         continue;
566                 }
567
568                 /* Skip this entry if it is not a remount. */
569                 if (mount_table[k].what && t > 0)
570                         continue;
571
572                 mkdir_p(where, 0755);
573
574 #ifdef HAVE_SELINUX
575                 if (arg_selinux_apifs_context &&
576                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
577                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
578                         if (!options)
579                                 return log_oom();
580
581                         o = options;
582                 } else
583 #endif
584                         o = mount_table[k].options;
585
586
587                 if (mount(mount_table[k].what,
588                           where,
589                           mount_table[k].type,
590                           mount_table[k].flags,
591                           o) < 0 &&
592                     mount_table[k].fatal) {
593
594                         log_error("mount(%s) failed: %m", where);
595
596                         if (r == 0)
597                                 r = -errno;
598                 }
599         }
600
601         return r;
602 }
603
604 static int mount_binds(const char *dest, char **l, unsigned long flags) {
605         char **x, **y;
606
607         STRV_FOREACH_PAIR(x, y, l) {
608                 char *where;
609                 struct stat source_st, dest_st;
610                 int r;
611
612                 if (stat(*x, &source_st) < 0) {
613                         log_error("failed to stat %s: %m", *x);
614                         return -errno;
615                 }
616
617                 where = strappenda(dest, *y);
618                 r = stat(where, &dest_st);
619                 if (r == 0) {
620                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
621                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
622                                                 *x, where);
623                                 return -EINVAL;
624                         }
625                 } else if (errno == ENOENT) {
626                         r = mkdir_parents_label(where, 0755);
627                         if (r < 0) {
628                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
629                                 return r;
630                         }
631                 } else {
632                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
633                         return -errno;
634                 }
635                 /* Create the mount point, but be conservative -- refuse to create block
636                 * and char devices. */
637                 if (S_ISDIR(source_st.st_mode))
638                         mkdir_label(where, 0755);
639                 else if (S_ISFIFO(source_st.st_mode))
640                         mkfifo(where, 0644);
641                 else if (S_ISSOCK(source_st.st_mode))
642                         mknod(where, 0644 | S_IFSOCK, 0);
643                 else if (S_ISREG(source_st.st_mode))
644                         touch(where);
645                 else {
646                         log_error("Refusing to create mountpoint for file: %s", *x);
647                         return -ENOTSUP;
648                 }
649
650                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
651                         log_error("mount(%s) failed: %m", where);
652                         return -errno;
653                 }
654
655                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
656                         log_error("mount(%s) failed: %m", where);
657                         return -errno;
658                 }
659         }
660
661         return 0;
662 }
663
664 static int setup_timezone(const char *dest) {
665         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
666         char *z, *y;
667         int r;
668
669         assert(dest);
670
671         /* Fix the timezone, if possible */
672         r = readlink_malloc("/etc/localtime", &p);
673         if (r < 0) {
674                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
675                 return 0;
676         }
677
678         z = path_startswith(p, "../usr/share/zoneinfo/");
679         if (!z)
680                 z = path_startswith(p, "/usr/share/zoneinfo/");
681         if (!z) {
682                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
683                 return 0;
684         }
685
686         where = strappend(dest, "/etc/localtime");
687         if (!where)
688                 return log_oom();
689
690         r = readlink_malloc(where, &q);
691         if (r >= 0) {
692                 y = path_startswith(q, "../usr/share/zoneinfo/");
693                 if (!y)
694                         y = path_startswith(q, "/usr/share/zoneinfo/");
695
696
697                 /* Already pointing to the right place? Then do nothing .. */
698                 if (y && streq(y, z))
699                         return 0;
700         }
701
702         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
703         if (!check)
704                 return log_oom();
705
706         if (access(check, F_OK) < 0) {
707                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
708                 return 0;
709         }
710
711         what = strappend("../usr/share/zoneinfo/", z);
712         if (!what)
713                 return log_oom();
714
715         unlink(where);
716         if (symlink(what, where) < 0) {
717                 log_error("Failed to correct timezone of container: %m");
718                 return 0;
719         }
720
721         return 0;
722 }
723
724 static int setup_resolv_conf(const char *dest) {
725         char _cleanup_free_ *where = NULL;
726
727         assert(dest);
728
729         if (arg_private_network)
730                 return 0;
731
732         /* Fix resolv.conf, if possible */
733         where = strappend(dest, "/etc/resolv.conf");
734         if (!where)
735                 return log_oom();
736
737         /* We don't really care for the results of this really. If it
738          * fails, it fails, but meh... */
739         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
740
741         return 0;
742 }
743
744 static int setup_boot_id(const char *dest) {
745         _cleanup_free_ char *from = NULL, *to = NULL;
746         sd_id128_t rnd = {};
747         char as_uuid[37];
748         int r;
749
750         assert(dest);
751
752         if (arg_share_system)
753                 return 0;
754
755         /* Generate a new randomized boot ID, so that each boot-up of
756          * the container gets a new one */
757
758         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
759         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
760         if (!from || !to)
761                 return log_oom();
762
763         r = sd_id128_randomize(&rnd);
764         if (r < 0) {
765                 log_error("Failed to generate random boot id: %s", strerror(-r));
766                 return r;
767         }
768
769         snprintf(as_uuid, sizeof(as_uuid),
770                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
771                  SD_ID128_FORMAT_VAL(rnd));
772         char_array_0(as_uuid);
773
774         r = write_string_file(from, as_uuid);
775         if (r < 0) {
776                 log_error("Failed to write boot id: %s", strerror(-r));
777                 return r;
778         }
779
780         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
781                 log_error("Failed to bind mount boot id: %m");
782                 r = -errno;
783         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
784                 log_warning("Failed to make boot id read-only: %m");
785
786         unlink(from);
787         return r;
788 }
789
790 static int copy_devnodes(const char *dest) {
791
792         static const char devnodes[] =
793                 "null\0"
794                 "zero\0"
795                 "full\0"
796                 "random\0"
797                 "urandom\0"
798                 "tty\0";
799
800         const char *d;
801         int r = 0;
802         _cleanup_umask_ mode_t u;
803
804         assert(dest);
805
806         u = umask(0000);
807
808         NULSTR_FOREACH(d, devnodes) {
809                 _cleanup_free_ char *from = NULL, *to = NULL;
810                 struct stat st;
811
812                 from = strappend("/dev/", d);
813                 to = strjoin(dest, "/dev/", d, NULL);
814                 if (!from || !to)
815                         return log_oom();
816
817                 if (stat(from, &st) < 0) {
818
819                         if (errno != ENOENT) {
820                                 log_error("Failed to stat %s: %m", from);
821                                 return -errno;
822                         }
823
824                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
825
826                         log_error("%s is not a char or block device, cannot copy", from);
827                         return -EIO;
828
829                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
830
831                         log_error("mknod(%s) failed: %m", dest);
832                         return  -errno;
833                 }
834         }
835
836         return r;
837 }
838
839 static int setup_ptmx(const char *dest) {
840         _cleanup_free_ char *p = NULL;
841
842         p = strappend(dest, "/dev/ptmx");
843         if (!p)
844                 return log_oom();
845
846         if (symlink("pts/ptmx", p) < 0) {
847                 log_error("Failed to create /dev/ptmx symlink: %m");
848                 return -errno;
849         }
850
851         return 0;
852 }
853
854 static int setup_dev_console(const char *dest, const char *console) {
855         struct stat st;
856         _cleanup_free_ char *to = NULL;
857         int r;
858         _cleanup_umask_ mode_t u;
859
860         assert(dest);
861         assert(console);
862
863         u = umask(0000);
864
865         if (stat(console, &st) < 0) {
866                 log_error("Failed to stat %s: %m", console);
867                 return -errno;
868
869         } else if (!S_ISCHR(st.st_mode)) {
870                 log_error("/dev/console is not a char device");
871                 return -EIO;
872         }
873
874         r = chmod_and_chown(console, 0600, 0, 0);
875         if (r < 0) {
876                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
877                 return r;
878         }
879
880         if (asprintf(&to, "%s/dev/console", dest) < 0)
881                 return log_oom();
882
883         /* We need to bind mount the right tty to /dev/console since
884          * ptys can only exist on pts file systems. To have something
885          * to bind mount things on we create a device node first, that
886          * has the right major/minor (note that the major minor
887          * doesn't actually matter here, since we mount it over
888          * anyway). */
889
890         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
891                 log_error("mknod() for /dev/console failed: %m");
892                 return -errno;
893         }
894
895         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
896                 log_error("Bind mount for /dev/console failed: %m");
897                 return -errno;
898         }
899
900         return 0;
901 }
902
903 static int setup_kmsg(const char *dest, int kmsg_socket) {
904         _cleanup_free_ char *from = NULL, *to = NULL;
905         int r, fd, k;
906         _cleanup_umask_ mode_t u;
907         union {
908                 struct cmsghdr cmsghdr;
909                 uint8_t buf[CMSG_SPACE(sizeof(int))];
910         } control = {};
911         struct msghdr mh = {
912                 .msg_control = &control,
913                 .msg_controllen = sizeof(control),
914         };
915         struct cmsghdr *cmsg;
916
917         assert(dest);
918         assert(kmsg_socket >= 0);
919
920         u = umask(0000);
921
922         /* We create the kmsg FIFO as /dev/kmsg, but immediately
923          * delete it after bind mounting it to /proc/kmsg. While FIFOs
924          * on the reading side behave very similar to /proc/kmsg,
925          * their writing side behaves differently from /dev/kmsg in
926          * that writing blocks when nothing is reading. In order to
927          * avoid any problems with containers deadlocking due to this
928          * we simply make /dev/kmsg unavailable to the container. */
929         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
930             asprintf(&to, "%s/proc/kmsg", dest) < 0)
931                 return log_oom();
932
933         if (mkfifo(from, 0600) < 0) {
934                 log_error("mkfifo() for /dev/kmsg failed: %m");
935                 return -errno;
936         }
937
938         r = chmod_and_chown(from, 0600, 0, 0);
939         if (r < 0) {
940                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
941                 return r;
942         }
943
944         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
945                 log_error("Bind mount for /proc/kmsg failed: %m");
946                 return -errno;
947         }
948
949         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
950         if (fd < 0) {
951                 log_error("Failed to open fifo: %m");
952                 return -errno;
953         }
954
955         cmsg = CMSG_FIRSTHDR(&mh);
956         cmsg->cmsg_level = SOL_SOCKET;
957         cmsg->cmsg_type = SCM_RIGHTS;
958         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
959         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
960
961         mh.msg_controllen = cmsg->cmsg_len;
962
963         /* Store away the fd in the socket, so that it stays open as
964          * long as we run the child */
965         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
966         close_nointr_nofail(fd);
967
968         if (k < 0) {
969                 log_error("Failed to send FIFO fd: %m");
970                 return -errno;
971         }
972
973         /* And now make the FIFO unavailable as /dev/kmsg... */
974         unlink(from);
975         return 0;
976 }
977
978 static int setup_hostname(void) {
979
980         if (arg_share_system)
981                 return 0;
982
983         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
984                 return -errno;
985
986         return 0;
987 }
988
989 static int setup_journal(const char *directory) {
990         sd_id128_t machine_id, this_id;
991         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
992         char *id;
993         int r;
994
995         p = strappend(directory, "/etc/machine-id");
996         if (!p)
997                 return log_oom();
998
999         r = read_one_line_file(p, &b);
1000         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1001                 return 0;
1002         else if (r < 0) {
1003                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1004                 return r;
1005         }
1006
1007         id = strstrip(b);
1008         if (isempty(id) && arg_link_journal == LINK_AUTO)
1009                 return 0;
1010
1011         /* Verify validity */
1012         r = sd_id128_from_string(id, &machine_id);
1013         if (r < 0) {
1014                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1015                 return r;
1016         }
1017
1018         r = sd_id128_get_machine(&this_id);
1019         if (r < 0) {
1020                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1021                 return r;
1022         }
1023
1024         if (sd_id128_equal(machine_id, this_id)) {
1025                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1026                          "Host and machine ids are equal (%s): refusing to link journals", id);
1027                 if (arg_link_journal == LINK_AUTO)
1028                         return 0;
1029                 return
1030                         -EEXIST;
1031         }
1032
1033         if (arg_link_journal == LINK_NO)
1034                 return 0;
1035
1036         free(p);
1037         p = strappend("/var/log/journal/", id);
1038         q = strjoin(directory, "/var/log/journal/", id, NULL);
1039         if (!p || !q)
1040                 return log_oom();
1041
1042         if (path_is_mount_point(p, false) > 0) {
1043                 if (arg_link_journal != LINK_AUTO) {
1044                         log_error("%s: already a mount point, refusing to use for journal", p);
1045                         return -EEXIST;
1046                 }
1047
1048                 return 0;
1049         }
1050
1051         if (path_is_mount_point(q, false) > 0) {
1052                 if (arg_link_journal != LINK_AUTO) {
1053                         log_error("%s: already a mount point, refusing to use for journal", q);
1054                         return -EEXIST;
1055                 }
1056
1057                 return 0;
1058         }
1059
1060         r = readlink_and_make_absolute(p, &d);
1061         if (r >= 0) {
1062                 if ((arg_link_journal == LINK_GUEST ||
1063                      arg_link_journal == LINK_AUTO) &&
1064                     path_equal(d, q)) {
1065
1066                         r = mkdir_p(q, 0755);
1067                         if (r < 0)
1068                                 log_warning("failed to create directory %s: %m", q);
1069                         return 0;
1070                 }
1071
1072                 if (unlink(p) < 0) {
1073                         log_error("Failed to remove symlink %s: %m", p);
1074                         return -errno;
1075                 }
1076         } else if (r == -EINVAL) {
1077
1078                 if (arg_link_journal == LINK_GUEST &&
1079                     rmdir(p) < 0) {
1080
1081                         if (errno == ENOTDIR) {
1082                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1083                                 return r;
1084                         } else {
1085                                 log_error("Failed to remove %s: %m", p);
1086                                 return -errno;
1087                         }
1088                 }
1089         } else if (r != -ENOENT) {
1090                 log_error("readlink(%s) failed: %m", p);
1091                 return r;
1092         }
1093
1094         if (arg_link_journal == LINK_GUEST) {
1095
1096                 if (symlink(q, p) < 0) {
1097                         log_error("Failed to symlink %s to %s: %m", q, p);
1098                         return -errno;
1099                 }
1100
1101                 r = mkdir_p(q, 0755);
1102                 if (r < 0)
1103                         log_warning("failed to create directory %s: %m", q);
1104                 return 0;
1105         }
1106
1107         if (arg_link_journal == LINK_HOST) {
1108                 r = mkdir_p(p, 0755);
1109                 if (r < 0) {
1110                         log_error("Failed to create %s: %m", p);
1111                         return r;
1112                 }
1113
1114         } else if (access(p, F_OK) < 0)
1115                 return 0;
1116
1117         if (dir_is_empty(q) == 0) {
1118                 log_error("%s not empty.", q);
1119                 return -ENOTEMPTY;
1120         }
1121
1122         r = mkdir_p(q, 0755);
1123         if (r < 0) {
1124                 log_error("Failed to create %s: %m", q);
1125                 return r;
1126         }
1127
1128         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1129                 log_error("Failed to bind mount journal from host into guest: %m");
1130                 return -errno;
1131         }
1132
1133         return 0;
1134 }
1135
1136 static int setup_kdbus(const char *dest, const char *path) {
1137         const char *p;
1138
1139         if (!path)
1140                 return 0;
1141
1142         p = strappenda(dest, "/dev/kdbus");
1143         if (mkdir(p, 0755) < 0) {
1144                 log_error("Failed to create kdbus path: %m");
1145                 return  -errno;
1146         }
1147
1148         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1149                 log_error("Failed to mount kdbus domain path: %m");
1150                 return -errno;
1151         }
1152
1153         return 0;
1154 }
1155
1156 static int drop_capabilities(void) {
1157         return capability_bounding_set_drop(~arg_retain, false);
1158 }
1159
1160 static int register_machine(pid_t pid) {
1161         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1162         _cleanup_bus_unref_ sd_bus *bus = NULL;
1163         int r;
1164
1165         if (!arg_register)
1166                 return 0;
1167
1168         r = sd_bus_default_system(&bus);
1169         if (r < 0) {
1170                 log_error("Failed to open system bus: %s", strerror(-r));
1171                 return r;
1172         }
1173
1174         if (arg_keep_unit) {
1175                 r = sd_bus_call_method(
1176                                 bus,
1177                                 "org.freedesktop.machine1",
1178                                 "/org/freedesktop/machine1",
1179                                 "org.freedesktop.machine1.Manager",
1180                                 "RegisterMachine",
1181                                 &error,
1182                                 NULL,
1183                                 "sayssus",
1184                                 arg_machine,
1185                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1186                                 "nspawn",
1187                                 "container",
1188                                 (uint32_t) pid,
1189                                 strempty(arg_directory));
1190         } else {
1191                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1192
1193                 r = sd_bus_message_new_method_call(
1194                                 bus,
1195                                 &m,
1196                                 "org.freedesktop.machine1",
1197                                 "/org/freedesktop/machine1",
1198                                 "org.freedesktop.machine1.Manager",
1199                                 "CreateMachine");
1200                 if (r < 0) {
1201                         log_error("Failed to create message: %s", strerror(-r));
1202                         return r;
1203                 }
1204
1205                 r = sd_bus_message_append(
1206                                 m,
1207                                 "sayssus",
1208                                 arg_machine,
1209                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1210                                 "nspawn",
1211                                 "container",
1212                                 (uint32_t) pid,
1213                                 strempty(arg_directory));
1214                 if (r < 0) {
1215                         log_error("Failed to append message arguments: %s", strerror(-r));
1216                         return r;
1217                 }
1218
1219                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1220                 if (r < 0) {
1221                         log_error("Failed to open container: %s", strerror(-r));
1222                         return r;
1223                 }
1224
1225                 if (!isempty(arg_slice)) {
1226                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1227                         if (r < 0) {
1228                                 log_error("Failed to append slice: %s", strerror(-r));
1229                                 return r;
1230                         }
1231                 }
1232
1233                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1234                 if (r < 0) {
1235                         log_error("Failed to add device policy: %s", strerror(-r));
1236                         return r;
1237                 }
1238
1239                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 8,
1240                                           /* Allow the container to
1241                                            * access and create the API
1242                                            * device nodes, so that
1243                                            * PrivateDevices= in the
1244                                            * container can work
1245                                            * fine */
1246                                           "/dev/null", "rwm",
1247                                           "/dev/zero", "rwm",
1248                                           "/dev/full", "rwm",
1249                                           "/dev/random", "rwm",
1250                                           "/dev/urandom", "rwm",
1251                                           "/dev/tty", "rwm",
1252                                           /* Allow the container
1253                                            * access to ptys. However,
1254                                            * do not permit the
1255                                            * container to ever create
1256                                            * these device nodes. */
1257                                           "/dev/pts/ptmx", "rw",
1258                                           "char-pts", "rw");
1259                 if (r < 0) {
1260                         log_error("Failed to add device whitelist: %s", strerror(-r));
1261                         return r;
1262                 }
1263
1264                 r = sd_bus_message_close_container(m);
1265                 if (r < 0) {
1266                         log_error("Failed to close container: %s", strerror(-r));
1267                         return r;
1268                 }
1269
1270                 r = sd_bus_call(bus, m, 0, &error, NULL);
1271         }
1272
1273         if (r < 0) {
1274                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1275                 return r;
1276         }
1277
1278         return 0;
1279 }
1280
1281 static int terminate_machine(pid_t pid) {
1282         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1283         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1284         _cleanup_bus_unref_ sd_bus *bus = NULL;
1285         const char *path;
1286         int r;
1287
1288         if (!arg_register)
1289                 return 0;
1290
1291         r = sd_bus_default_system(&bus);
1292         if (r < 0) {
1293                 log_error("Failed to open system bus: %s", strerror(-r));
1294                 return r;
1295         }
1296
1297         r = sd_bus_call_method(
1298                         bus,
1299                         "org.freedesktop.machine1",
1300                         "/org/freedesktop/machine1",
1301                         "org.freedesktop.machine1.Manager",
1302                         "GetMachineByPID",
1303                         &error,
1304                         &reply,
1305                         "u",
1306                         (uint32_t) pid);
1307         if (r < 0) {
1308                 /* Note that the machine might already have been
1309                  * cleaned up automatically, hence don't consider it a
1310                  * failure if we cannot get the machine object. */
1311                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1312                 return 0;
1313         }
1314
1315         r = sd_bus_message_read(reply, "o", &path);
1316         if (r < 0)
1317                 return bus_log_parse_error(r);
1318
1319         r = sd_bus_call_method(
1320                         bus,
1321                         "org.freedesktop.machine1",
1322                         path,
1323                         "org.freedesktop.machine1.Machine",
1324                         "Terminate",
1325                         &error,
1326                         NULL,
1327                         NULL);
1328         if (r < 0) {
1329                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1330                 return 0;
1331         }
1332
1333         return 0;
1334 }
1335
1336 static int reset_audit_loginuid(void) {
1337         _cleanup_free_ char *p = NULL;
1338         int r;
1339
1340         if (arg_share_system)
1341                 return 0;
1342
1343         r = read_one_line_file("/proc/self/loginuid", &p);
1344         if (r == -EEXIST)
1345                 return 0;
1346         if (r < 0) {
1347                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1348                 return r;
1349         }
1350
1351         /* Already reset? */
1352         if (streq(p, "4294967295"))
1353                 return 0;
1354
1355         r = write_string_file("/proc/self/loginuid", "4294967295");
1356         if (r < 0) {
1357                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1358                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1359                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1360                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1361                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1362
1363                 sleep(5);
1364         }
1365
1366         return 0;
1367 }
1368
1369 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1370         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1371         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1372         int r;
1373
1374         if (!arg_private_network)
1375                 return 0;
1376
1377         if (!arg_network_veth)
1378                 return 0;
1379
1380         /* Use two different interface name prefixes depending whether
1381          * we are in bridge mode or not. */
1382         if (arg_network_bridge)
1383                 memcpy(iface_name, "vb-", 3);
1384         else
1385                 memcpy(iface_name, "ve-", 3);
1386
1387         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1388
1389         r = sd_rtnl_open(&rtnl, 0);
1390         if (r < 0) {
1391                 log_error("Failed to connect to netlink: %s", strerror(-r));
1392                 return r;
1393         }
1394
1395         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1396         if (r < 0) {
1397                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1398                 return r;
1399         }
1400
1401         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1402         if (r < 0) {
1403                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1404                 return r;
1405         }
1406
1407         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1408         if (r < 0) {
1409                 log_error("Failed to open netlink container: %s", strerror(-r));
1410                 return r;
1411         }
1412
1413         r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1414         if (r < 0) {
1415                 log_error("Failed to append netlink kind: %s", strerror(-r));
1416                 return r;
1417         }
1418
1419         r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1420         if (r < 0) {
1421                 log_error("Failed to open netlink container: %s", strerror(-r));
1422                 return r;
1423         }
1424
1425         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1426         if (r < 0) {
1427                 log_error("Failed to open netlink container: %s", strerror(-r));
1428                 return r;
1429         }
1430
1431         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1432         if (r < 0) {
1433                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1434                 return r;
1435         }
1436
1437         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1438         if (r < 0) {
1439                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1440                 return r;
1441         }
1442
1443         r = sd_rtnl_message_close_container(m);
1444         if (r < 0) {
1445                 log_error("Failed to close netlink container: %s", strerror(-r));
1446                 return r;
1447         }
1448
1449         r = sd_rtnl_message_close_container(m);
1450         if (r < 0) {
1451                 log_error("Failed to close netlink container: %s", strerror(-r));
1452                 return r;
1453         }
1454
1455         r = sd_rtnl_message_close_container(m);
1456         if (r < 0) {
1457                 log_error("Failed to close netlink container: %s", strerror(-r));
1458                 return r;
1459         }
1460
1461         r = sd_rtnl_call(rtnl, m, 0, NULL);
1462         if (r < 0) {
1463                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1464                 return r;
1465         }
1466
1467         return 0;
1468 }
1469
1470 static int setup_bridge(const char veth_name[]) {
1471         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1472         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1473         int r, bridge;
1474
1475         if (!arg_private_network)
1476                 return 0;
1477
1478         if (!arg_network_veth)
1479                 return 0;
1480
1481         if (!arg_network_bridge)
1482                 return 0;
1483
1484         bridge = (int) if_nametoindex(arg_network_bridge);
1485         if (bridge <= 0) {
1486                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1487                 return -errno;
1488         }
1489
1490         r = sd_rtnl_open(&rtnl, 0);
1491         if (r < 0) {
1492                 log_error("Failed to connect to netlink: %s", strerror(-r));
1493                 return r;
1494         }
1495
1496         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1497         if (r < 0) {
1498                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1499                 return r;
1500         }
1501
1502         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1503         if (r < 0) {
1504                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1505                 return r;
1506         }
1507
1508         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1509         if (r < 0) {
1510                 log_error("Failed to add netlink master field: %s", strerror(-r));
1511                 return r;
1512         }
1513
1514         r = sd_rtnl_call(rtnl, m, 0, NULL);
1515         if (r < 0) {
1516                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1517                 return r;
1518         }
1519
1520         return 0;
1521 }
1522
1523 static int move_network_interfaces(pid_t pid) {
1524         _cleanup_udev_unref_ struct udev *udev = NULL;
1525         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1526         char **i;
1527         int r;
1528
1529         if (!arg_private_network)
1530                 return 0;
1531
1532         if (strv_isempty(arg_network_interfaces))
1533                 return 0;
1534
1535         r = sd_rtnl_open(&rtnl, 0);
1536         if (r < 0) {
1537                 log_error("Failed to connect to netlink: %s", strerror(-r));
1538                 return r;
1539         }
1540
1541         udev = udev_new();
1542         if (!udev) {
1543                 log_error("Failed to connect to udev.");
1544                 return -ENOMEM;
1545         }
1546
1547         STRV_FOREACH(i, arg_network_interfaces) {
1548                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1549                 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1550                 char ifi_str[2 + DECIMAL_STR_MAX(int)];
1551                 int ifi;
1552
1553                 ifi = (int) if_nametoindex(*i);
1554                 if (ifi <= 0) {
1555                         log_error("Failed to resolve interface %s: %m", *i);
1556                         return -errno;
1557                 }
1558
1559                 sprintf(ifi_str, "n%i", ifi);
1560                 d = udev_device_new_from_device_id(udev, ifi_str);
1561                 if (!d) {
1562                         log_error("Failed to get udev device for interface %s: %m", *i);
1563                         return -errno;
1564                 }
1565
1566                 if (udev_device_get_is_initialized(d) <= 0) {
1567                         log_error("Network interface %s is not initialized yet.", *i);
1568                         return -EBUSY;
1569                 }
1570
1571                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1572                 if (r < 0) {
1573                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1574                         return r;
1575                 }
1576
1577                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1578                 if (r < 0) {
1579                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1580                         return r;
1581                 }
1582
1583                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1584                 if (r < 0) {
1585                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1586                         return r;
1587                 }
1588         }
1589
1590         return 0;
1591 }
1592
/* Installs a seccomp filter that makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT while
 * allowing everything else. Without HAVE_SECCOMP this is a no-op
 * returning 0. Returns a negative errno-style value on failure. */
static int audit_still_doesnt_work_in_containers(void) {

#ifndef HAVE_SECCOMP
        return 0;
#else
        scmp_filter_ctx filter;
        int r;

        /*
           Audit does not work properly in containers; large parts of
           the userspace audit hookup fail when run inside of one. We
           do not bother and simply disable creation of audit sockets.

           Audit userspace takes EAFNOSUPPORT from
           socket(AF_NETLINK, *, NETLINK_AUDIT) as a sign that audit
           is disabled in the kernel, so that is what we return.
         */

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        r = seccomp_add_secondary_archs(filter);
        if (r < 0 && r != -EEXIST) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto out;
        }

        r = seccomp_rule_add(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto out;
        }

        /* Turn the filter's NO_NEW_PRIVS attribute off before loading */
        r = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto out;
        }

        r = seccomp_load(filter);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

out:
        seccomp_release(filter);
        return r;
#endif

}
1649
1650 int main(int argc, char *argv[]) {
1651
1652         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1653         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1654         _cleanup_free_ char *kdbus_domain = NULL;
1655         _cleanup_fdset_free_ FDSet *fds = NULL;
1656         const char *console = NULL;
1657         int r = EXIT_FAILURE, k;
1658         int n_fd_passed;
1659         pid_t pid = 0;
1660         sigset_t mask;
1661         char veth_name[IFNAMSIZ];
1662
1663         log_parse_environment();
1664         log_open();
1665
1666         k = parse_argv(argc, argv);
1667         if (k < 0)
1668                 goto finish;
1669         else if (k == 0) {
1670                 r = EXIT_SUCCESS;
1671                 goto finish;
1672         }
1673
1674         if (arg_directory) {
1675                 char *p;
1676
1677                 p = path_make_absolute_cwd(arg_directory);
1678                 free(arg_directory);
1679                 arg_directory = p;
1680         } else
1681                 arg_directory = get_current_dir_name();
1682
1683         if (!arg_directory) {
1684                 log_error("Failed to determine path, please use -D.");
1685                 goto finish;
1686         }
1687
1688         path_kill_slashes(arg_directory);
1689
1690         if (!arg_machine) {
1691                 arg_machine = strdup(basename(arg_directory));
1692                 if (!arg_machine) {
1693                         log_oom();
1694                         goto finish;
1695                 }
1696
1697                 hostname_cleanup(arg_machine, false);
1698                 if (isempty(arg_machine)) {
1699                         log_error("Failed to determine machine name automatically, please use -M.");
1700                         goto finish;
1701                 }
1702         }
1703
1704         if (geteuid() != 0) {
1705                 log_error("Need to be root.");
1706                 goto finish;
1707         }
1708
1709         if (sd_booted() <= 0) {
1710                 log_error("Not running on a systemd system.");
1711                 goto finish;
1712         }
1713
1714         if (path_equal(arg_directory, "/")) {
1715                 log_error("Spawning container on root directory not supported.");
1716                 goto finish;
1717         }
1718
1719         if (arg_boot) {
1720                 if (path_is_os_tree(arg_directory) <= 0) {
1721                         log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1722                         goto finish;
1723                 }
1724         } else {
1725                 const char *p;
1726
1727                 p = strappenda(arg_directory,
1728                                argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
1729                 if (access(p, F_OK) < 0) {
1730                         log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
1731                         goto finish;
1732
1733                 }
1734         }
1735
1736         log_close();
1737         n_fd_passed = sd_listen_fds(false);
1738         if (n_fd_passed > 0) {
1739                 k = fdset_new_listen_fds(&fds, false);
1740                 if (k < 0) {
1741                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1742                         goto finish;
1743                 }
1744         }
1745         fdset_close_others(fds);
1746         log_open();
1747
1748         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1749         if (master < 0) {
1750                 log_error("Failed to acquire pseudo tty: %m");
1751                 goto finish;
1752         }
1753
1754         console = ptsname(master);
1755         if (!console) {
1756                 log_error("Failed to determine tty name: %m");
1757                 goto finish;
1758         }
1759
1760         if (!arg_quiet)
1761                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1762
1763         if (unlockpt(master) < 0) {
1764                 log_error("Failed to unlock tty: %m");
1765                 goto finish;
1766         }
1767
1768         if (access("/dev/kdbus/control", F_OK) >= 0) {
1769
1770                 if (arg_share_system) {
1771                         kdbus_domain = strdup("/dev/kdbus");
1772                         if (!kdbus_domain) {
1773                                 log_oom();
1774                                 goto finish;
1775                         }
1776                 } else {
1777                         const char *ns;
1778
1779                         ns = strappenda("machine-", arg_machine);
1780                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1781                         if (r < 0)
1782                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1783                         else
1784                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1785                 }
1786         }
1787
1788         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1789                 log_error("Failed to create kmsg socket pair: %m");
1790                 goto finish;
1791         }
1792
1793         sd_notify(0, "READY=1");
1794
1795         assert_se(sigemptyset(&mask) == 0);
1796         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1797         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1798
1799         for (;;) {
1800                 siginfo_t status;
1801
1802                 sync_fd = eventfd(0, EFD_CLOEXEC);
1803                 if (sync_fd < 0) {
1804                         log_error("Failed to create event fd: %m");
1805                         goto finish;
1806                 }
1807
1808                 pid = syscall(__NR_clone,
1809                               SIGCHLD|CLONE_NEWNS|
1810                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1811                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
1812                 if (pid < 0) {
1813                         if (errno == EINVAL)
1814                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1815                         else
1816                                 log_error("clone() failed: %m");
1817
1818                         goto finish;
1819                 }
1820
1821                 if (pid == 0) {
1822                         /* child */
1823                         const char *home = NULL;
1824                         uid_t uid = (uid_t) -1;
1825                         gid_t gid = (gid_t) -1;
1826                         unsigned n_env = 2;
1827                         const char *envp[] = {
1828                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1829                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1830                                 NULL, /* TERM */
1831                                 NULL, /* HOME */
1832                                 NULL, /* USER */
1833                                 NULL, /* LOGNAME */
1834                                 NULL, /* container_uuid */
1835                                 NULL, /* LISTEN_FDS */
1836                                 NULL, /* LISTEN_PID */
1837                                 NULL
1838                         };
1839                         char **env_use;
1840                         eventfd_t x;
1841
1842                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1843                         if (envp[n_env])
1844                                 n_env ++;
1845
1846                         close_nointr_nofail(master);
1847                         master = -1;
1848
1849                         close_nointr(STDIN_FILENO);
1850                         close_nointr(STDOUT_FILENO);
1851                         close_nointr(STDERR_FILENO);
1852
1853                         close_nointr_nofail(kmsg_socket_pair[0]);
1854                         kmsg_socket_pair[0] = -1;
1855
1856                         reset_all_signal_handlers();
1857
1858                         assert_se(sigemptyset(&mask) == 0);
1859                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1860
1861                         k = open_terminal(console, O_RDWR);
1862                         if (k != STDIN_FILENO) {
1863                                 if (k >= 0) {
1864                                         close_nointr_nofail(k);
1865                                         k = -EINVAL;
1866                                 }
1867
1868                                 log_error("Failed to open console: %s", strerror(-k));
1869                                 goto child_fail;
1870                         }
1871
1872                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1873                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1874                                 log_error("Failed to duplicate console: %m");
1875                                 goto child_fail;
1876                         }
1877
1878                         if (setsid() < 0) {
1879                                 log_error("setsid() failed: %m");
1880                                 goto child_fail;
1881                         }
1882
1883                         if (reset_audit_loginuid() < 0)
1884                                 goto child_fail;
1885
1886                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1887                                 log_error("PR_SET_PDEATHSIG failed: %m");
1888                                 goto child_fail;
1889                         }
1890
1891                         /* Mark everything as slave, so that we still
1892                          * receive mounts from the real root, but don't
1893                          * propagate mounts to the real root. */
1894                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1895                                 log_error("MS_SLAVE|MS_REC failed: %m");
1896                                 goto child_fail;
1897                         }
1898
1899                         /* Turn directory into bind mount */
1900                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1901                                 log_error("Failed to make bind mount.");
1902                                 goto child_fail;
1903                         }
1904
1905                         if (arg_read_only)
1906                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1907                                         log_error("Failed to make read-only.");
1908                                         goto child_fail;
1909                                 }
1910
1911                         if (mount_all(arg_directory) < 0)
1912                                 goto child_fail;
1913
1914                         if (copy_devnodes(arg_directory) < 0)
1915                                 goto child_fail;
1916
1917                         if (setup_ptmx(arg_directory) < 0)
1918                                 goto child_fail;
1919
1920                         dev_setup(arg_directory);
1921
1922                         if (audit_still_doesnt_work_in_containers() < 0)
1923                                 goto child_fail;
1924
1925                         if (setup_dev_console(arg_directory, console) < 0)
1926                                 goto child_fail;
1927
1928                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1929                                 goto child_fail;
1930
1931                         close_nointr_nofail(kmsg_socket_pair[1]);
1932                         kmsg_socket_pair[1] = -1;
1933
1934                         if (setup_boot_id(arg_directory) < 0)
1935                                 goto child_fail;
1936
1937                         if (setup_timezone(arg_directory) < 0)
1938                                 goto child_fail;
1939
1940                         if (setup_resolv_conf(arg_directory) < 0)
1941                                 goto child_fail;
1942
1943                         if (setup_journal(arg_directory) < 0)
1944                                 goto child_fail;
1945
1946                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1947                                 goto child_fail;
1948
1949                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1950                                 goto child_fail;
1951
1952                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1953                                 goto child_fail;
1954
1955                         if (chdir(arg_directory) < 0) {
1956                                 log_error("chdir(%s) failed: %m", arg_directory);
1957                                 goto child_fail;
1958                         }
1959
1960                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1961                                 log_error("mount(MS_MOVE) failed: %m");
1962                                 goto child_fail;
1963                         }
1964
1965                         if (chroot(".") < 0) {
1966                                 log_error("chroot() failed: %m");
1967                                 goto child_fail;
1968                         }
1969
1970                         if (chdir("/") < 0) {
1971                                 log_error("chdir() failed: %m");
1972                                 goto child_fail;
1973                         }
1974
1975                         umask(0022);
1976
1977                         if (arg_private_network)
1978                                 loopback_setup();
1979
1980                         if (drop_capabilities() < 0) {
1981                                 log_error("drop_capabilities() failed: %m");
1982                                 goto child_fail;
1983                         }
1984
1985                         if (arg_user) {
1986
1987                                 /* Note that this resolves user names
1988                                  * inside the container, and hence
1989                                  * accesses the NSS modules from the
1990                                  * container and not the host. This is
1991                                  * a bit weird... */
1992
1993                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1994                                         log_error("get_user_creds() failed: %m");
1995                                         goto child_fail;
1996                                 }
1997
1998                                 if (mkdir_parents_label(home, 0775) < 0) {
1999                                         log_error("mkdir_parents_label() failed: %m");
2000                                         goto child_fail;
2001                                 }
2002
2003                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
2004                                         log_error("mkdir_safe_label() failed: %m");
2005                                         goto child_fail;
2006                                 }
2007
2008                                 if (initgroups((const char*)arg_user, gid) < 0) {
2009                                         log_error("initgroups() failed: %m");
2010                                         goto child_fail;
2011                                 }
2012
2013                                 if (setresgid(gid, gid, gid) < 0) {
2014                                         log_error("setregid() failed: %m");
2015                                         goto child_fail;
2016                                 }
2017
2018                                 if (setresuid(uid, uid, uid) < 0) {
2019                                         log_error("setreuid() failed: %m");
2020                                         goto child_fail;
2021                                 }
2022                         } else {
2023                                 /* Reset everything fully to 0, just in case */
2024
2025                                 if (setgroups(0, NULL) < 0) {
2026                                         log_error("setgroups() failed: %m");
2027                                         goto child_fail;
2028                                 }
2029
2030                                 if (setresgid(0, 0, 0) < 0) {
2031                                         log_error("setregid() failed: %m");
2032                                         goto child_fail;
2033                                 }
2034
2035                                 if (setresuid(0, 0, 0) < 0) {
2036                                         log_error("setreuid() failed: %m");
2037                                         goto child_fail;
2038                                 }
2039                         }
2040
2041                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2042                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2043                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
2044                                 log_oom();
2045                                 goto child_fail;
2046                         }
2047
2048                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2049                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
2050                                         log_oom();
2051                                         goto child_fail;
2052                                 }
2053                         }
2054
2055                         if (fdset_size(fds) > 0) {
2056                                 k = fdset_cloexec(fds, false);
2057                                 if (k < 0) {
2058                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
2059                                         goto child_fail;
2060                                 }
2061
2062                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
2063                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
2064                                         log_oom();
2065                                         goto child_fail;
2066                                 }
2067                         }
2068
2069                         setup_hostname();
2070
2071                         if (arg_personality != 0xffffffffLU) {
2072                                 if (personality(arg_personality) < 0) {
2073                                         log_error("personality() failed: %m");
2074                                         goto child_fail;
2075                                 }
2076                         }
2077
2078                         eventfd_read(sync_fd, &x);
2079                         close_nointr_nofail(sync_fd);
2080                         sync_fd = -1;
2081
2082                         if (!strv_isempty(arg_setenv)) {
2083                                 char **n;
2084
2085                                 n = strv_env_merge(2, envp, arg_setenv);
2086                                 if (!n) {
2087                                         log_oom();
2088                                         goto child_fail;
2089                                 }
2090
2091                                 env_use = n;
2092                         } else
2093                                 env_use = (char**) envp;
2094
2095 #ifdef HAVE_SELINUX
2096                         if (arg_selinux_context)
2097                                 if (setexeccon(arg_selinux_context) < 0)
2098                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
2099 #endif
2100                         if (arg_boot) {
2101                                 char **a;
2102                                 size_t l;
2103
2104                                 /* Automatically search for the init system */
2105
2106                                 l = 1 + argc - optind;
2107                                 a = newa(char*, l + 1);
2108                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
2109
2110                                 a[0] = (char*) "/usr/lib/systemd/systemd";
2111                                 execve(a[0], a, env_use);
2112
2113                                 a[0] = (char*) "/lib/systemd/systemd";
2114                                 execve(a[0], a, env_use);
2115
2116                                 a[0] = (char*) "/sbin/init";
2117                                 execve(a[0], a, env_use);
2118                         } else if (argc > optind)
2119                                 execvpe(argv[optind], argv + optind, env_use);
2120                         else {
2121                                 chdir(home ? home : "/root");
2122                                 execle("/bin/bash", "-bash", NULL, env_use);
2123                                 execle("/bin/sh", "-sh", NULL, env_use);
2124                         }
2125
2126                         log_error("execv() failed: %m");
2127
2128                 child_fail:
2129                         _exit(EXIT_FAILURE);
2130                 }
2131
2132                 fdset_free(fds);
2133                 fds = NULL;
2134
2135                 r = register_machine(pid);
2136                 if (r < 0)
2137                         goto finish;
2138
2139                 r = move_network_interfaces(pid);
2140                 if (r < 0)
2141                         goto finish;
2142
2143                 r = setup_veth(pid, veth_name);
2144                 if (r < 0)
2145                         goto finish;
2146
2147                 r = setup_bridge(veth_name);
2148                 if (r < 0)
2149                         goto finish;
2150
2151                 eventfd_write(sync_fd, 1);
2152                 close_nointr_nofail(sync_fd);
2153                 sync_fd = -1;
2154
2155                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
2156                 if (k < 0) {
2157                         r = EXIT_FAILURE;
2158                         break;
2159                 }
2160
2161                 if (!arg_quiet)
2162                         putc('\n', stdout);
2163
2164                 /* Kill if it is not dead yet anyway */
2165                 terminate_machine(pid);
2166
2167                 /* Redundant, but better safe than sorry */
2168                 kill(pid, SIGKILL);
2169
2170                 k = wait_for_terminate(pid, &status);
2171                 pid = 0;
2172
2173                 if (k < 0) {
2174                         r = EXIT_FAILURE;
2175                         break;
2176                 }
2177
2178                 if (status.si_code == CLD_EXITED) {
2179                         r = status.si_status;
2180                         if (status.si_status != 0) {
2181                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
2182                                 break;
2183                         }
2184
2185                         if (!arg_quiet)
2186                                 log_debug("Container %s exited successfully.", arg_machine);
2187                         break;
2188                 } else if (status.si_code == CLD_KILLED &&
2189                            status.si_status == SIGINT) {
2190
2191                         if (!arg_quiet)
2192                                 log_info("Container %s has been shut down.", arg_machine);
2193                         r = 0;
2194                         break;
2195                 } else if (status.si_code == CLD_KILLED &&
2196                            status.si_status == SIGHUP) {
2197
2198                         if (!arg_quiet)
2199                                 log_info("Container %s is being rebooted.", arg_machine);
2200                         continue;
2201                 } else if (status.si_code == CLD_KILLED ||
2202                            status.si_code == CLD_DUMPED) {
2203
2204                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2205                         r = EXIT_FAILURE;
2206                         break;
2207                 } else {
2208                         log_error("Container %s failed due to unknown reason.", arg_machine);
2209                         r = EXIT_FAILURE;
2210                         break;
2211                 }
2212         }
2213
2214 finish:
2215         if (pid > 0)
2216                 kill(pid, SIGKILL);
2217
2218         free(arg_directory);
2219         free(arg_machine);
2220         free(arg_setenv);
2221         free(arg_network_interfaces);
2222
2223         return r;
2224 }