chiark / gitweb /
084929dcdedaf0d94ececaba460bf18d6d420b23
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89
90 #ifdef HAVE_SECCOMP
91 #include "seccomp-util.h"
92 #endif
93
/* Journal linking mode for the container, chosen with --link-journal= or -j
 * (see parse_argv()). What each mode does at runtime is implemented outside
 * this part of the file. */
94 typedef enum LinkJournal {
95         LINK_NO,    /* --link-journal=no */
96         LINK_AUTO,  /* --link-journal=auto; initial value of arg_link_journal */
97         LINK_HOST,  /* --link-journal=host */
98         LINK_GUEST  /* --link-journal=guest; also what -j selects */
99 } LinkJournal;
100
/* Command line state, filled in by parse_argv(). Pointers declared "const char *"
 * point straight at optarg (i.e. into argv) and are not owned; "char *" and
 * "char **" values are heap allocated. */
101 static char *arg_directory = NULL; /* -D/--directory=, canonicalized path */
102 static char *arg_user = NULL; /* -u/--user= */
103 static sd_id128_t arg_uuid = {}; /* --uuid=, parsed with sd_id128_from_string() */
104 static char *arg_machine = NULL; /* -M/--machine=; an empty argument resets it to NULL */
105 static const char *arg_selinux_context = NULL; /* -Z/--selinux-context= */
106 static const char *arg_selinux_apifs_context = NULL; /* -L/--selinux-apifs-context=; appended as context= to tmpfs/devpts mounts in mount_all() */
107 static const char *arg_slice = NULL; /* -S/--slice= */
108 static bool arg_private_network = false; /* --private-network; also switched on by every --network-* option */
109 static bool arg_read_only = false; /* --read-only */
110 static bool arg_boot = false; /* -b/--boot */
111 static LinkJournal arg_link_journal = LINK_AUTO; /* --link-journal= / -j */
/* Bit mask (1ULL << CAP_xxx) of capabilities to retain. This is the default set;
 * parse_argv() ORs in --capability= (and CAP_NET_ADMIN when a private network
 * is requested) and then clears everything named by --drop-capability=. */
112 static uint64_t arg_retain =
113         (1ULL << CAP_CHOWN) |
114         (1ULL << CAP_DAC_OVERRIDE) |
115         (1ULL << CAP_DAC_READ_SEARCH) |
116         (1ULL << CAP_FOWNER) |
117         (1ULL << CAP_FSETID) |
118         (1ULL << CAP_IPC_OWNER) |
119         (1ULL << CAP_KILL) |
120         (1ULL << CAP_LEASE) |
121         (1ULL << CAP_LINUX_IMMUTABLE) |
122         (1ULL << CAP_NET_BIND_SERVICE) |
123         (1ULL << CAP_NET_BROADCAST) |
124         (1ULL << CAP_NET_RAW) |
125         (1ULL << CAP_SETGID) |
126         (1ULL << CAP_SETFCAP) |
127         (1ULL << CAP_SETPCAP) |
128         (1ULL << CAP_SETUID) |
129         (1ULL << CAP_SYS_ADMIN) |
130         (1ULL << CAP_SYS_CHROOT) |
131         (1ULL << CAP_SYS_NICE) |
132         (1ULL << CAP_SYS_PTRACE) |
133         (1ULL << CAP_SYS_TTY_CONFIG) |
134         (1ULL << CAP_SYS_RESOURCE) |
135         (1ULL << CAP_SYS_BOOT) |
136         (1ULL << CAP_AUDIT_WRITE) |
137         (1ULL << CAP_AUDIT_CONTROL) |
138         (1ULL << CAP_MKNOD);
139 static char **arg_bind = NULL; /* --bind=: flat list of (host source, container destination) pairs */
140 static char **arg_bind_ro = NULL; /* --bind-ro=: same layout as arg_bind */
141 static char **arg_setenv = NULL; /* --setenv=: NAME=VALUE assignments */
142 static bool arg_quiet = false; /* -q/--quiet */
143 static bool arg_share_system = false; /* --share-system; forces arg_register to false */
144 static bool arg_register = true; /* --register=BOOLEAN */
145 static bool arg_keep_unit = false; /* --keep-unit */
146 static char **arg_network_interfaces = NULL; /* --network-interface=, one entry per use */
147 static char **arg_network_macvlan = NULL; /* --network-macvlan=, one entry per use */
148 static bool arg_network_veth = false; /* --network-veth; also set by --network-bridge= */
149 static const char *arg_network_bridge = NULL; /* --network-bridge= */
150 static unsigned long arg_personality = 0xffffffffLU; /* --personality=; 0xffffffffLU is the value parse_argv() treats as invalid, presumably meaning "not set" here -- TODO confirm */
151 static const char *arg_image = NULL; /* -i/--image= */
152
153 static int help(void) {
154
155         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
156                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
157                "  -h --help                 Show this help\n"
158                "     --version              Print version string\n"
159                "  -q --quiet                Do not show status information\n"
160                "  -D --directory=PATH       Root directory for the container\n"
161                "  -i --image=PATH           File system device or image for the container\n"
162                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
163                "  -u --user=USER            Run the command under specified user or uid\n"
164                "  -M --machine=NAME         Set the machine name for the container\n"
165                "     --uuid=UUID            Set a specific machine UUID for the container\n"
166                "  -S --slice=SLICE          Place the container in the specified slice\n"
167                "     --private-network      Disable network in container\n"
168                "     --network-interface=INTERFACE\n"
169                "                            Assign an existing network interface to the\n"
170                "                            container\n"
171                "     --network-macvlan=INTERFACE\n"
172                "                            Create a macvlan network interface based on an\n"
173                "                            existing network interface to the container\n"
174                "     --network-veth         Add a virtual ethernet connection between host\n"
175                "                            and container\n"
176                "     --network-bridge=INTERFACE\n"
177                "                            Add a virtual ethernet connection between host\n"
178                "                            and container and add it to an existing bridge on\n"
179                "                            the host\n"
180                "  -Z --selinux-context=SECLABEL\n"
181                "                            Set the SELinux security context to be used by\n"
182                "                            processes in the container\n"
183                "  -L --selinux-apifs-context=SECLABEL\n"
184                "                            Set the SELinux security context to be used by\n"
185                "                            API/tmpfs file systems in the container\n"
186                "     --capability=CAP       In addition to the default, retain specified\n"
187                "                            capability\n"
188                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
189                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
190                "  -j                        Equivalent to --link-journal=host\n"
191                "     --read-only            Mount the root directory read-only\n"
192                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
193                "                            the container\n"
194                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
195                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
196                "     --share-system         Share system namespaces with host\n"
197                "     --register=BOOLEAN     Register container as machine\n"
198                "     --keep-unit            Do not register a scope for the machine, reuse\n"
199                "                            the service unit nspawn is running in\n",
200                program_invocation_short_name);
201
202         return 0;
203 }
204
205 static int parse_argv(int argc, char *argv[]) {
206
207         enum {
208                 ARG_VERSION = 0x100,
209                 ARG_PRIVATE_NETWORK,
210                 ARG_UUID,
211                 ARG_READ_ONLY,
212                 ARG_CAPABILITY,
213                 ARG_DROP_CAPABILITY,
214                 ARG_LINK_JOURNAL,
215                 ARG_BIND,
216                 ARG_BIND_RO,
217                 ARG_SETENV,
218                 ARG_SHARE_SYSTEM,
219                 ARG_REGISTER,
220                 ARG_KEEP_UNIT,
221                 ARG_NETWORK_INTERFACE,
222                 ARG_NETWORK_MACVLAN,
223                 ARG_NETWORK_VETH,
224                 ARG_NETWORK_BRIDGE,
225                 ARG_PERSONALITY,
226         };
227
228         static const struct option options[] = {
229                 { "help",                  no_argument,       NULL, 'h'                   },
230                 { "version",               no_argument,       NULL, ARG_VERSION           },
231                 { "directory",             required_argument, NULL, 'D'                   },
232                 { "user",                  required_argument, NULL, 'u'                   },
233                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
234                 { "boot",                  no_argument,       NULL, 'b'                   },
235                 { "uuid",                  required_argument, NULL, ARG_UUID              },
236                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
237                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
238                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
239                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
240                 { "bind",                  required_argument, NULL, ARG_BIND              },
241                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
242                 { "machine",               required_argument, NULL, 'M'                   },
243                 { "slice",                 required_argument, NULL, 'S'                   },
244                 { "setenv",                required_argument, NULL, ARG_SETENV            },
245                 { "selinux-context",       required_argument, NULL, 'Z'                   },
246                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
247                 { "quiet",                 no_argument,       NULL, 'q'                   },
248                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
249                 { "register",              required_argument, NULL, ARG_REGISTER          },
250                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
251                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
252                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
253                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
254                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
255                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
256                 { "image",                 required_argument, NULL, 'i'                   },
257                 {}
258         };
259
260         int c, r;
261         uint64_t plus = 0, minus = 0;
262
263         assert(argc >= 0);
264         assert(argv);
265
266         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
267
268                 switch (c) {
269
270                 case 'h':
271                         return help();
272
273                 case ARG_VERSION:
274                         puts(PACKAGE_STRING);
275                         puts(SYSTEMD_FEATURES);
276                         return 0;
277
278                 case 'D':
279                         free(arg_directory);
280                         arg_directory = canonicalize_file_name(optarg);
281                         if (!arg_directory) {
282                                 log_error("Invalid root directory: %m");
283                                 return -ENOMEM;
284                         }
285
286                         break;
287
288                 case 'i':
289                         arg_image = optarg;
290                         break;
291
292                 case 'u':
293                         free(arg_user);
294                         arg_user = strdup(optarg);
295                         if (!arg_user)
296                                 return log_oom();
297
298                         break;
299
300                 case ARG_NETWORK_BRIDGE:
301                         arg_network_bridge = optarg;
302
303                         /* fall through */
304
305                 case ARG_NETWORK_VETH:
306                         arg_network_veth = true;
307                         arg_private_network = true;
308                         break;
309
310                 case ARG_NETWORK_INTERFACE:
311                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
312                                 return log_oom();
313
314                         arg_private_network = true;
315                         break;
316
317                 case ARG_NETWORK_MACVLAN:
318                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
319                                 return log_oom();
320
321                         /* fall through */
322
323                 case ARG_PRIVATE_NETWORK:
324                         arg_private_network = true;
325                         break;
326
327                 case 'b':
328                         arg_boot = true;
329                         break;
330
331                 case ARG_UUID:
332                         r = sd_id128_from_string(optarg, &arg_uuid);
333                         if (r < 0) {
334                                 log_error("Invalid UUID: %s", optarg);
335                                 return r;
336                         }
337                         break;
338
339                 case 'S':
340                         arg_slice = optarg;
341                         break;
342
343                 case 'M':
344                         if (isempty(optarg)) {
345                                 free(arg_machine);
346                                 arg_machine = NULL;
347                         } else {
348
349                                 if (!hostname_is_valid(optarg)) {
350                                         log_error("Invalid machine name: %s", optarg);
351                                         return -EINVAL;
352                                 }
353
354                                 free(arg_machine);
355                                 arg_machine = strdup(optarg);
356                                 if (!arg_machine)
357                                         return log_oom();
358
359                                 break;
360                         }
361
362                 case 'Z':
363                         arg_selinux_context = optarg;
364                         break;
365
366                 case 'L':
367                         arg_selinux_apifs_context = optarg;
368                         break;
369
370                 case ARG_READ_ONLY:
371                         arg_read_only = true;
372                         break;
373
374                 case ARG_CAPABILITY:
375                 case ARG_DROP_CAPABILITY: {
376                         char *state, *word;
377                         size_t length;
378
379                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
380                                 _cleanup_free_ char *t;
381                                 cap_value_t cap;
382
383                                 t = strndup(word, length);
384                                 if (!t)
385                                         return log_oom();
386
387                                 if (streq(t, "all")) {
388                                         if (c == ARG_CAPABILITY)
389                                                 plus = (uint64_t) -1;
390                                         else
391                                                 minus = (uint64_t) -1;
392                                 } else {
393                                         if (cap_from_name(t, &cap) < 0) {
394                                                 log_error("Failed to parse capability %s.", t);
395                                                 return -EINVAL;
396                                         }
397
398                                         if (c == ARG_CAPABILITY)
399                                                 plus |= 1ULL << (uint64_t) cap;
400                                         else
401                                                 minus |= 1ULL << (uint64_t) cap;
402                                 }
403                         }
404
405                         break;
406                 }
407
408                 case 'j':
409                         arg_link_journal = LINK_GUEST;
410                         break;
411
412                 case ARG_LINK_JOURNAL:
413                         if (streq(optarg, "auto"))
414                                 arg_link_journal = LINK_AUTO;
415                         else if (streq(optarg, "no"))
416                                 arg_link_journal = LINK_NO;
417                         else if (streq(optarg, "guest"))
418                                 arg_link_journal = LINK_GUEST;
419                         else if (streq(optarg, "host"))
420                                 arg_link_journal = LINK_HOST;
421                         else {
422                                 log_error("Failed to parse link journal mode %s", optarg);
423                                 return -EINVAL;
424                         }
425
426                         break;
427
428                 case ARG_BIND:
429                 case ARG_BIND_RO: {
430                         _cleanup_free_ char *a = NULL, *b = NULL;
431                         char *e;
432                         char ***x;
433
434                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
435
436                         e = strchr(optarg, ':');
437                         if (e) {
438                                 a = strndup(optarg, e - optarg);
439                                 b = strdup(e + 1);
440                         } else {
441                                 a = strdup(optarg);
442                                 b = strdup(optarg);
443                         }
444
445                         if (!a || !b)
446                                 return log_oom();
447
448                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
449                                 log_error("Invalid bind mount specification: %s", optarg);
450                                 return -EINVAL;
451                         }
452
453                         r = strv_extend(x, a);
454                         if (r < 0)
455                                 return log_oom();
456
457                         r = strv_extend(x, b);
458                         if (r < 0)
459                                 return log_oom();
460
461                         break;
462                 }
463
464                 case ARG_SETENV: {
465                         char **n;
466
467                         if (!env_assignment_is_valid(optarg)) {
468                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
469                                 return -EINVAL;
470                         }
471
472                         n = strv_env_set(arg_setenv, optarg);
473                         if (!n)
474                                 return log_oom();
475
476                         strv_free(arg_setenv);
477                         arg_setenv = n;
478                         break;
479                 }
480
481                 case 'q':
482                         arg_quiet = true;
483                         break;
484
485                 case ARG_SHARE_SYSTEM:
486                         arg_share_system = true;
487                         break;
488
489                 case ARG_REGISTER:
490                         r = parse_boolean(optarg);
491                         if (r < 0) {
492                                 log_error("Failed to parse --register= argument: %s", optarg);
493                                 return r;
494                         }
495
496                         arg_register = r;
497                         break;
498
499                 case ARG_KEEP_UNIT:
500                         arg_keep_unit = true;
501                         break;
502
503                 case ARG_PERSONALITY:
504
505                         arg_personality = personality_from_string(optarg);
506                         if (arg_personality == 0xffffffffLU) {
507                                 log_error("Unknown or unsupported personality '%s'.", optarg);
508                                 return -EINVAL;
509                         }
510
511                         break;
512
513                 case '?':
514                         return -EINVAL;
515
516                 default:
517                         assert_not_reached("Unhandled option");
518                 }
519         }
520
521         if (arg_share_system)
522                 arg_register = false;
523
524         if (arg_boot && arg_share_system) {
525                 log_error("--boot and --share-system may not be combined.");
526                 return -EINVAL;
527         }
528
529         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
530                 log_error("--keep-unit may not be used when invoked from a user session.");
531                 return -EINVAL;
532         }
533
534         if (arg_directory && arg_image) {
535                 log_error("--directory= and --image= may not be combined.");
536                 return -EINVAL;
537         }
538
539         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
540
541         return 1;
542 }
543
544 static int mount_all(const char *dest) {
545
546         typedef struct MountPoint {
547                 const char *what;
548                 const char *where;
549                 const char *type;
550                 const char *options;
551                 unsigned long flags;
552                 bool fatal;
553         } MountPoint;
554
555         static const MountPoint mount_table[] = {
556                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
557                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
558                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
559                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
560                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
561                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
562                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
563                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
564 #ifdef HAVE_SELINUX
565                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
566                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
567 #endif
568         };
569
570         unsigned k;
571         int r = 0;
572
573         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
574                 _cleanup_free_ char *where = NULL;
575 #ifdef HAVE_SELINUX
576                 _cleanup_free_ char *options = NULL;
577 #endif
578                 const char *o;
579                 int t;
580
581                 where = strjoin(dest, "/", mount_table[k].where, NULL);
582                 if (!where)
583                         return log_oom();
584
585                 t = path_is_mount_point(where, true);
586                 if (t < 0) {
587                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
588
589                         if (r == 0)
590                                 r = t;
591
592                         continue;
593                 }
594
595                 /* Skip this entry if it is not a remount. */
596                 if (mount_table[k].what && t > 0)
597                         continue;
598
599                 mkdir_p(where, 0755);
600
601 #ifdef HAVE_SELINUX
602                 if (arg_selinux_apifs_context &&
603                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
604                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
605                         if (!options)
606                                 return log_oom();
607
608                         o = options;
609                 } else
610 #endif
611                         o = mount_table[k].options;
612
613
614                 if (mount(mount_table[k].what,
615                           where,
616                           mount_table[k].type,
617                           mount_table[k].flags,
618                           o) < 0 &&
619                     mount_table[k].fatal) {
620
621                         log_error("mount(%s) failed: %m", where);
622
623                         if (r == 0)
624                                 r = -errno;
625                 }
626         }
627
628         return r;
629 }
630
631 static int mount_binds(const char *dest, char **l, unsigned long flags) {
632         char **x, **y;
633
634         STRV_FOREACH_PAIR(x, y, l) {
635                 char *where;
636                 struct stat source_st, dest_st;
637                 int r;
638
639                 if (stat(*x, &source_st) < 0) {
640                         log_error("Failed to stat %s: %m", *x);
641                         return -errno;
642                 }
643
644                 where = strappenda(dest, *y);
645                 r = stat(where, &dest_st);
646                 if (r == 0) {
647                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
648                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
649                                                 *x, where);
650                                 return -EINVAL;
651                         }
652                 } else if (errno == ENOENT) {
653                         r = mkdir_parents_label(where, 0755);
654                         if (r < 0) {
655                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
656                                 return r;
657                         }
658                 } else {
659                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
660                         return -errno;
661                 }
662                 /* Create the mount point, but be conservative -- refuse to create block
663                 * and char devices. */
664                 if (S_ISDIR(source_st.st_mode))
665                         mkdir_label(where, 0755);
666                 else if (S_ISFIFO(source_st.st_mode))
667                         mkfifo(where, 0644);
668                 else if (S_ISSOCK(source_st.st_mode))
669                         mknod(where, 0644 | S_IFSOCK, 0);
670                 else if (S_ISREG(source_st.st_mode))
671                         touch(where);
672                 else {
673                         log_error("Refusing to create mountpoint for file: %s", *x);
674                         return -ENOTSUP;
675                 }
676
677                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
678                         log_error("mount(%s) failed: %m", where);
679                         return -errno;
680                 }
681
682                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
683                         log_error("mount(%s) failed: %m", where);
684                         return -errno;
685                 }
686         }
687
688         return 0;
689 }
690
691 static int setup_timezone(const char *dest) {
692         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
693         char *z, *y;
694         int r;
695
696         assert(dest);
697
698         /* Fix the timezone, if possible */
699         r = readlink_malloc("/etc/localtime", &p);
700         if (r < 0) {
701                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
702                 return 0;
703         }
704
705         z = path_startswith(p, "../usr/share/zoneinfo/");
706         if (!z)
707                 z = path_startswith(p, "/usr/share/zoneinfo/");
708         if (!z) {
709                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
710                 return 0;
711         }
712
713         where = strappend(dest, "/etc/localtime");
714         if (!where)
715                 return log_oom();
716
717         r = readlink_malloc(where, &q);
718         if (r >= 0) {
719                 y = path_startswith(q, "../usr/share/zoneinfo/");
720                 if (!y)
721                         y = path_startswith(q, "/usr/share/zoneinfo/");
722
723
724                 /* Already pointing to the right place? Then do nothing .. */
725                 if (y && streq(y, z))
726                         return 0;
727         }
728
729         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
730         if (!check)
731                 return log_oom();
732
733         if (access(check, F_OK) < 0) {
734                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
735                 return 0;
736         }
737
738         what = strappend("../usr/share/zoneinfo/", z);
739         if (!what)
740                 return log_oom();
741
742         unlink(where);
743         if (symlink(what, where) < 0) {
744                 log_error("Failed to correct timezone of container: %m");
745                 return 0;
746         }
747
748         return 0;
749 }
750
751 static int setup_resolv_conf(const char *dest) {
752         char _cleanup_free_ *where = NULL;
753
754         assert(dest);
755
756         if (arg_private_network)
757                 return 0;
758
759         /* Fix resolv.conf, if possible */
760         where = strappend(dest, "/etc/resolv.conf");
761         if (!where)
762                 return log_oom();
763
764         /* We don't really care for the results of this really. If it
765          * fails, it fails, but meh... */
766         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
767
768         return 0;
769 }
770
771 static int setup_boot_id(const char *dest) {
772         _cleanup_free_ char *from = NULL, *to = NULL;
773         sd_id128_t rnd = {};
774         char as_uuid[37];
775         int r;
776
777         assert(dest);
778
779         if (arg_share_system)
780                 return 0;
781
782         /* Generate a new randomized boot ID, so that each boot-up of
783          * the container gets a new one */
784
785         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
786         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
787         if (!from || !to)
788                 return log_oom();
789
790         r = sd_id128_randomize(&rnd);
791         if (r < 0) {
792                 log_error("Failed to generate random boot id: %s", strerror(-r));
793                 return r;
794         }
795
796         snprintf(as_uuid, sizeof(as_uuid),
797                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
798                  SD_ID128_FORMAT_VAL(rnd));
799         char_array_0(as_uuid);
800
801         r = write_string_file(from, as_uuid);
802         if (r < 0) {
803                 log_error("Failed to write boot id: %s", strerror(-r));
804                 return r;
805         }
806
807         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
808                 log_error("Failed to bind mount boot id: %m");
809                 r = -errno;
810         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
811                 log_warning("Failed to make boot id read-only: %m");
812
813         unlink(from);
814         return r;
815 }
816
817 static int copy_devnodes(const char *dest) {
818
819         static const char devnodes[] =
820                 "null\0"
821                 "zero\0"
822                 "full\0"
823                 "random\0"
824                 "urandom\0"
825                 "tty\0";
826
827         const char *d;
828         int r = 0;
829         _cleanup_umask_ mode_t u;
830
831         assert(dest);
832
833         u = umask(0000);
834
835         NULSTR_FOREACH(d, devnodes) {
836                 _cleanup_free_ char *from = NULL, *to = NULL;
837                 struct stat st;
838
839                 from = strappend("/dev/", d);
840                 to = strjoin(dest, "/dev/", d, NULL);
841                 if (!from || !to)
842                         return log_oom();
843
844                 if (stat(from, &st) < 0) {
845
846                         if (errno != ENOENT) {
847                                 log_error("Failed to stat %s: %m", from);
848                                 return -errno;
849                         }
850
851                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
852
853                         log_error("%s is not a char or block device, cannot copy", from);
854                         return -EIO;
855
856                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
857
858                         log_error("mknod(%s) failed: %m", dest);
859                         return  -errno;
860                 }
861         }
862
863         return r;
864 }
865
866 static int setup_ptmx(const char *dest) {
867         _cleanup_free_ char *p = NULL;
868
869         p = strappend(dest, "/dev/ptmx");
870         if (!p)
871                 return log_oom();
872
873         if (symlink("pts/ptmx", p) < 0) {
874                 log_error("Failed to create /dev/ptmx symlink: %m");
875                 return -errno;
876         }
877
878         return 0;
879 }
880
881 static int setup_dev_console(const char *dest, const char *console) {
882         _cleanup_umask_ mode_t u;
883         const char *to;
884         struct stat st;
885         int r;
886
887         assert(dest);
888         assert(console);
889
890         u = umask(0000);
891
892         if (stat("/dev/null", &st) < 0) {
893                 log_error("Failed to stat /dev/null: %m");
894                 return -errno;
895         }
896
897         r = chmod_and_chown(console, 0600, 0, 0);
898         if (r < 0) {
899                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
900                 return r;
901         }
902
903         /* We need to bind mount the right tty to /dev/console since
904          * ptys can only exist on pts file systems. To have something
905          * to bind mount things on we create a device node first, and
906          * use /dev/null for that since we the cgroups device policy
907          * allows us to create that freely, while we cannot create
908          * /dev/console. (Note that the major minor doesn't actually
909          * matter here, since we mount it over anyway). */
910
911         to = strappenda(dest, "/dev/console");
912         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
913                 log_error("mknod() for /dev/console failed: %m");
914                 return -errno;
915         }
916
917         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
918                 log_error("Bind mount for /dev/console failed: %m");
919                 return -errno;
920         }
921
922         return 0;
923 }
924
925 static int setup_kmsg(const char *dest, int kmsg_socket) {
926         _cleanup_free_ char *from = NULL, *to = NULL;
927         int r, fd, k;
928         _cleanup_umask_ mode_t u;
929         union {
930                 struct cmsghdr cmsghdr;
931                 uint8_t buf[CMSG_SPACE(sizeof(int))];
932         } control = {};
933         struct msghdr mh = {
934                 .msg_control = &control,
935                 .msg_controllen = sizeof(control),
936         };
937         struct cmsghdr *cmsg;
938
939         assert(dest);
940         assert(kmsg_socket >= 0);
941
942         u = umask(0000);
943
944         /* We create the kmsg FIFO as /dev/kmsg, but immediately
945          * delete it after bind mounting it to /proc/kmsg. While FIFOs
946          * on the reading side behave very similar to /proc/kmsg,
947          * their writing side behaves differently from /dev/kmsg in
948          * that writing blocks when nothing is reading. In order to
949          * avoid any problems with containers deadlocking due to this
950          * we simply make /dev/kmsg unavailable to the container. */
951         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
952             asprintf(&to, "%s/proc/kmsg", dest) < 0)
953                 return log_oom();
954
955         if (mkfifo(from, 0600) < 0) {
956                 log_error("mkfifo() for /dev/kmsg failed: %m");
957                 return -errno;
958         }
959
960         r = chmod_and_chown(from, 0600, 0, 0);
961         if (r < 0) {
962                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
963                 return r;
964         }
965
966         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
967                 log_error("Bind mount for /proc/kmsg failed: %m");
968                 return -errno;
969         }
970
971         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
972         if (fd < 0) {
973                 log_error("Failed to open fifo: %m");
974                 return -errno;
975         }
976
977         cmsg = CMSG_FIRSTHDR(&mh);
978         cmsg->cmsg_level = SOL_SOCKET;
979         cmsg->cmsg_type = SCM_RIGHTS;
980         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
981         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
982
983         mh.msg_controllen = cmsg->cmsg_len;
984
985         /* Store away the fd in the socket, so that it stays open as
986          * long as we run the child */
987         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
988         close_nointr_nofail(fd);
989
990         if (k < 0) {
991                 log_error("Failed to send FIFO fd: %m");
992                 return -errno;
993         }
994
995         /* And now make the FIFO unavailable as /dev/kmsg... */
996         unlink(from);
997         return 0;
998 }
999
1000 static int setup_hostname(void) {
1001
1002         if (arg_share_system)
1003                 return 0;
1004
1005         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1006                 return -errno;
1007
1008         return 0;
1009 }
1010
1011 static int setup_journal(const char *directory) {
1012         sd_id128_t machine_id, this_id;
1013         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1014         char *id;
1015         int r;
1016
1017         p = strappend(directory, "/etc/machine-id");
1018         if (!p)
1019                 return log_oom();
1020
1021         r = read_one_line_file(p, &b);
1022         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1023                 return 0;
1024         else if (r < 0) {
1025                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1026                 return r;
1027         }
1028
1029         id = strstrip(b);
1030         if (isempty(id) && arg_link_journal == LINK_AUTO)
1031                 return 0;
1032
1033         /* Verify validity */
1034         r = sd_id128_from_string(id, &machine_id);
1035         if (r < 0) {
1036                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1037                 return r;
1038         }
1039
1040         r = sd_id128_get_machine(&this_id);
1041         if (r < 0) {
1042                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1043                 return r;
1044         }
1045
1046         if (sd_id128_equal(machine_id, this_id)) {
1047                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1048                          "Host and machine ids are equal (%s): refusing to link journals", id);
1049                 if (arg_link_journal == LINK_AUTO)
1050                         return 0;
1051                 return
1052                         -EEXIST;
1053         }
1054
1055         if (arg_link_journal == LINK_NO)
1056                 return 0;
1057
1058         free(p);
1059         p = strappend("/var/log/journal/", id);
1060         q = strjoin(directory, "/var/log/journal/", id, NULL);
1061         if (!p || !q)
1062                 return log_oom();
1063
1064         if (path_is_mount_point(p, false) > 0) {
1065                 if (arg_link_journal != LINK_AUTO) {
1066                         log_error("%s: already a mount point, refusing to use for journal", p);
1067                         return -EEXIST;
1068                 }
1069
1070                 return 0;
1071         }
1072
1073         if (path_is_mount_point(q, false) > 0) {
1074                 if (arg_link_journal != LINK_AUTO) {
1075                         log_error("%s: already a mount point, refusing to use for journal", q);
1076                         return -EEXIST;
1077                 }
1078
1079                 return 0;
1080         }
1081
1082         r = readlink_and_make_absolute(p, &d);
1083         if (r >= 0) {
1084                 if ((arg_link_journal == LINK_GUEST ||
1085                      arg_link_journal == LINK_AUTO) &&
1086                     path_equal(d, q)) {
1087
1088                         r = mkdir_p(q, 0755);
1089                         if (r < 0)
1090                                 log_warning("failed to create directory %s: %m", q);
1091                         return 0;
1092                 }
1093
1094                 if (unlink(p) < 0) {
1095                         log_error("Failed to remove symlink %s: %m", p);
1096                         return -errno;
1097                 }
1098         } else if (r == -EINVAL) {
1099
1100                 if (arg_link_journal == LINK_GUEST &&
1101                     rmdir(p) < 0) {
1102
1103                         if (errno == ENOTDIR) {
1104                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1105                                 return r;
1106                         } else {
1107                                 log_error("Failed to remove %s: %m", p);
1108                                 return -errno;
1109                         }
1110                 }
1111         } else if (r != -ENOENT) {
1112                 log_error("readlink(%s) failed: %m", p);
1113                 return r;
1114         }
1115
1116         if (arg_link_journal == LINK_GUEST) {
1117
1118                 if (symlink(q, p) < 0) {
1119                         log_error("Failed to symlink %s to %s: %m", q, p);
1120                         return -errno;
1121                 }
1122
1123                 r = mkdir_p(q, 0755);
1124                 if (r < 0)
1125                         log_warning("failed to create directory %s: %m", q);
1126                 return 0;
1127         }
1128
1129         if (arg_link_journal == LINK_HOST) {
1130                 r = mkdir_p(p, 0755);
1131                 if (r < 0) {
1132                         log_error("Failed to create %s: %m", p);
1133                         return r;
1134                 }
1135
1136         } else if (access(p, F_OK) < 0)
1137                 return 0;
1138
1139         if (dir_is_empty(q) == 0) {
1140                 log_error("%s not empty.", q);
1141                 return -ENOTEMPTY;
1142         }
1143
1144         r = mkdir_p(q, 0755);
1145         if (r < 0) {
1146                 log_error("Failed to create %s: %m", q);
1147                 return r;
1148         }
1149
1150         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1151                 log_error("Failed to bind mount journal from host into guest: %m");
1152                 return -errno;
1153         }
1154
1155         return 0;
1156 }
1157
/* Bind mounts the kdbus domain directory path to <dest>/dev/kdbus.
 *
 * Does nothing if path is NULL. Returns 0 on success or a negative
 * errno-style value. */
static int setup_kdbus(const char *dest, const char *path) {
        const char *target;

        if (path == NULL)
                return 0;

        /* The mount point has to exist before it can be mounted on */
        target = strappenda(dest, "/dev/kdbus");
        if (mkdir(target, 0755) < 0) {
                log_error("Failed to create kdbus path: %m");
                return -errno;
        }

        if (mount(path, target, "bind", MS_BIND, NULL) < 0) {
                log_error("Failed to mount kdbus domain path: %m");
                return -errno;
        }

        return 0;
}
1177
1178 static int drop_capabilities(void) {
1179         return capability_bounding_set_drop(~arg_retain, false);
1180 }
1181
1182 static int register_machine(pid_t pid) {
1183         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1184         _cleanup_bus_unref_ sd_bus *bus = NULL;
1185         int r;
1186
1187         if (!arg_register)
1188                 return 0;
1189
1190         r = sd_bus_default_system(&bus);
1191         if (r < 0) {
1192                 log_error("Failed to open system bus: %s", strerror(-r));
1193                 return r;
1194         }
1195
1196         if (arg_keep_unit) {
1197                 r = sd_bus_call_method(
1198                                 bus,
1199                                 "org.freedesktop.machine1",
1200                                 "/org/freedesktop/machine1",
1201                                 "org.freedesktop.machine1.Manager",
1202                                 "RegisterMachine",
1203                                 &error,
1204                                 NULL,
1205                                 "sayssus",
1206                                 arg_machine,
1207                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1208                                 "nspawn",
1209                                 "container",
1210                                 (uint32_t) pid,
1211                                 strempty(arg_directory));
1212         } else {
1213                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1214
1215                 r = sd_bus_message_new_method_call(
1216                                 bus,
1217                                 &m,
1218                                 "org.freedesktop.machine1",
1219                                 "/org/freedesktop/machine1",
1220                                 "org.freedesktop.machine1.Manager",
1221                                 "CreateMachine");
1222                 if (r < 0) {
1223                         log_error("Failed to create message: %s", strerror(-r));
1224                         return r;
1225                 }
1226
1227                 r = sd_bus_message_append(
1228                                 m,
1229                                 "sayssus",
1230                                 arg_machine,
1231                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1232                                 "nspawn",
1233                                 "container",
1234                                 (uint32_t) pid,
1235                                 strempty(arg_directory));
1236                 if (r < 0) {
1237                         log_error("Failed to append message arguments: %s", strerror(-r));
1238                         return r;
1239                 }
1240
1241                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1242                 if (r < 0) {
1243                         log_error("Failed to open container: %s", strerror(-r));
1244                         return r;
1245                 }
1246
1247                 if (!isempty(arg_slice)) {
1248                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1249                         if (r < 0) {
1250                                 log_error("Failed to append slice: %s", strerror(-r));
1251                                 return r;
1252                         }
1253                 }
1254
1255                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1256                 if (r < 0) {
1257                         log_error("Failed to add device policy: %s", strerror(-r));
1258                         return r;
1259                 }
1260
1261                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1262                                           /* Allow the container to
1263                                            * access and create the API
1264                                            * device nodes, so that
1265                                            * PrivateDevices= in the
1266                                            * container can work
1267                                            * fine */
1268                                           "/dev/null", "rwm",
1269                                           "/dev/zero", "rwm",
1270                                           "/dev/full", "rwm",
1271                                           "/dev/random", "rwm",
1272                                           "/dev/urandom", "rwm",
1273                                           "/dev/tty", "rwm",
1274                                           /* Allow the container
1275                                            * access to ptys. However,
1276                                            * do not permit the
1277                                            * container to ever create
1278                                            * these device nodes. */
1279                                           "/dev/pts/ptmx", "rw",
1280                                           "char-pts", "rw",
1281                                           /* Allow the container
1282                                            * access to all kdbus
1283                                            * devices. Again, the
1284                                            * container cannot create
1285                                            * these nodes, only use
1286                                            * them. We use a pretty
1287                                            * open match here, so that
1288                                            * the kernel API can still
1289                                            * change. */
1290                                           "char-kdbus", "rw",
1291                                           "char-kdbus/*", "rw");
1292                 if (r < 0) {
1293                         log_error("Failed to add device whitelist: %s", strerror(-r));
1294                         return r;
1295                 }
1296
1297                 r = sd_bus_message_close_container(m);
1298                 if (r < 0) {
1299                         log_error("Failed to close container: %s", strerror(-r));
1300                         return r;
1301                 }
1302
1303                 r = sd_bus_call(bus, m, 0, &error, NULL);
1304         }
1305
1306         if (r < 0) {
1307                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1308                 return r;
1309         }
1310
1311         return 0;
1312 }
1313
1314 static int terminate_machine(pid_t pid) {
1315         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1316         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1317         _cleanup_bus_unref_ sd_bus *bus = NULL;
1318         const char *path;
1319         int r;
1320
1321         if (!arg_register)
1322                 return 0;
1323
1324         r = sd_bus_default_system(&bus);
1325         if (r < 0) {
1326                 log_error("Failed to open system bus: %s", strerror(-r));
1327                 return r;
1328         }
1329
1330         r = sd_bus_call_method(
1331                         bus,
1332                         "org.freedesktop.machine1",
1333                         "/org/freedesktop/machine1",
1334                         "org.freedesktop.machine1.Manager",
1335                         "GetMachineByPID",
1336                         &error,
1337                         &reply,
1338                         "u",
1339                         (uint32_t) pid);
1340         if (r < 0) {
1341                 /* Note that the machine might already have been
1342                  * cleaned up automatically, hence don't consider it a
1343                  * failure if we cannot get the machine object. */
1344                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1345                 return 0;
1346         }
1347
1348         r = sd_bus_message_read(reply, "o", &path);
1349         if (r < 0)
1350                 return bus_log_parse_error(r);
1351
1352         r = sd_bus_call_method(
1353                         bus,
1354                         "org.freedesktop.machine1",
1355                         path,
1356                         "org.freedesktop.machine1.Machine",
1357                         "Terminate",
1358                         &error,
1359                         NULL,
1360                         NULL);
1361         if (r < 0) {
1362                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1363                 return 0;
1364         }
1365
1366         return 0;
1367 }
1368
1369 static int reset_audit_loginuid(void) {
1370         _cleanup_free_ char *p = NULL;
1371         int r;
1372
1373         if (arg_share_system)
1374                 return 0;
1375
1376         r = read_one_line_file("/proc/self/loginuid", &p);
1377         if (r == -ENOENT)
1378                 return 0;
1379         if (r < 0) {
1380                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1381                 return r;
1382         }
1383
1384         /* Already reset? */
1385         if (streq(p, "4294967295"))
1386                 return 0;
1387
1388         r = write_string_file("/proc/self/loginuid", "4294967295");
1389         if (r < 0) {
1390                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1391                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1392                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1393                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1394                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1395
1396                 sleep(5);
1397         }
1398
1399         return 0;
1400 }
1401
1402 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1403         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1404         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1405         int r;
1406
1407         if (!arg_private_network)
1408                 return 0;
1409
1410         if (!arg_network_veth)
1411                 return 0;
1412
1413         /* Use two different interface name prefixes depending whether
1414          * we are in bridge mode or not. */
1415         if (arg_network_bridge)
1416                 memcpy(iface_name, "vb-", 3);
1417         else
1418                 memcpy(iface_name, "ve-", 3);
1419
1420         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1421
1422         r = sd_rtnl_open(&rtnl, 0);
1423         if (r < 0) {
1424                 log_error("Failed to connect to netlink: %s", strerror(-r));
1425                 return r;
1426         }
1427
1428         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1429         if (r < 0) {
1430                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1431                 return r;
1432         }
1433
1434         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1435         if (r < 0) {
1436                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1437                 return r;
1438         }
1439
1440         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1441         if (r < 0) {
1442                 log_error("Failed to open netlink container: %s", strerror(-r));
1443                 return r;
1444         }
1445
1446         r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1447         if (r < 0) {
1448                 log_error("Failed to append netlink kind: %s", strerror(-r));
1449                 return r;
1450         }
1451
1452         r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1453         if (r < 0) {
1454                 log_error("Failed to open netlink container: %s", strerror(-r));
1455                 return r;
1456         }
1457
1458         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1459         if (r < 0) {
1460                 log_error("Failed to open netlink container: %s", strerror(-r));
1461                 return r;
1462         }
1463
1464         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1465         if (r < 0) {
1466                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1467                 return r;
1468         }
1469
1470         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1471         if (r < 0) {
1472                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1473                 return r;
1474         }
1475
1476         r = sd_rtnl_message_close_container(m);
1477         if (r < 0) {
1478                 log_error("Failed to close netlink container: %s", strerror(-r));
1479                 return r;
1480         }
1481
1482         r = sd_rtnl_message_close_container(m);
1483         if (r < 0) {
1484                 log_error("Failed to close netlink container: %s", strerror(-r));
1485                 return r;
1486         }
1487
1488         r = sd_rtnl_message_close_container(m);
1489         if (r < 0) {
1490                 log_error("Failed to close netlink container: %s", strerror(-r));
1491                 return r;
1492         }
1493
1494         r = sd_rtnl_call(rtnl, m, 0, NULL);
1495         if (r < 0) {
1496                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1497                 return r;
1498         }
1499
1500         return 0;
1501 }
1502
1503 static int setup_bridge(const char veth_name[]) {
1504         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1505         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1506         int r, bridge;
1507
1508         if (!arg_private_network)
1509                 return 0;
1510
1511         if (!arg_network_veth)
1512                 return 0;
1513
1514         if (!arg_network_bridge)
1515                 return 0;
1516
1517         bridge = (int) if_nametoindex(arg_network_bridge);
1518         if (bridge <= 0) {
1519                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1520                 return -errno;
1521         }
1522
1523         r = sd_rtnl_open(&rtnl, 0);
1524         if (r < 0) {
1525                 log_error("Failed to connect to netlink: %s", strerror(-r));
1526                 return r;
1527         }
1528
1529         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1530         if (r < 0) {
1531                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1532                 return r;
1533         }
1534
1535         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1536         if (r < 0) {
1537                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1538                 return r;
1539         }
1540
1541         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1542         if (r < 0) {
1543                 log_error("Failed to add netlink master field: %s", strerror(-r));
1544                 return r;
1545         }
1546
1547         r = sd_rtnl_call(rtnl, m, 0, NULL);
1548         if (r < 0) {
1549                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1550                 return r;
1551         }
1552
1553         return 0;
1554 }
1555
1556 static int parse_interface(struct udev *udev, const char *name) {
1557         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1558         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1559         int ifi;
1560
1561         ifi = (int) if_nametoindex(name);
1562         if (ifi <= 0) {
1563                 log_error("Failed to resolve interface %s: %m", name);
1564                 return -errno;
1565         }
1566
1567         sprintf(ifi_str, "n%i", ifi);
1568         d = udev_device_new_from_device_id(udev, ifi_str);
1569         if (!d) {
1570                 log_error("Failed to get udev device for interface %s: %m", name);
1571                 return -errno;
1572         }
1573
1574         if (udev_device_get_is_initialized(d) <= 0) {
1575                 log_error("Network interface %s is not initialized yet.", name);
1576                 return -EBUSY;
1577         }
1578
1579         return ifi;
1580 }
1581
1582 static int move_network_interfaces(pid_t pid) {
1583         _cleanup_udev_unref_ struct udev *udev = NULL;
1584         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1585         char **i;
1586         int r;
1587
1588         if (!arg_private_network)
1589                 return 0;
1590
1591         if (strv_isempty(arg_network_interfaces))
1592                 return 0;
1593
1594         r = sd_rtnl_open(&rtnl, 0);
1595         if (r < 0) {
1596                 log_error("Failed to connect to netlink: %s", strerror(-r));
1597                 return r;
1598         }
1599
1600         udev = udev_new();
1601         if (!udev) {
1602                 log_error("Failed to connect to udev.");
1603                 return -ENOMEM;
1604         }
1605
1606         STRV_FOREACH(i, arg_network_interfaces) {
1607                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1608                 int ifi;
1609
1610                 ifi = parse_interface(udev, *i);
1611                 if (ifi < 0)
1612                         return ifi;
1613
1614                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1615                 if (r < 0) {
1616                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1617                         return r;
1618                 }
1619
1620                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1621                 if (r < 0) {
1622                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1623                         return r;
1624                 }
1625
1626                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1627                 if (r < 0) {
1628                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1629                         return r;
1630                 }
1631         }
1632
1633         return 0;
1634 }
1635
1636 static int setup_macvlan(pid_t pid) {
1637         _cleanup_udev_unref_ struct udev *udev = NULL;
1638         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1639         char **i;
1640         int r;
1641
1642         if (!arg_private_network)
1643                 return 0;
1644
1645         if (strv_isempty(arg_network_macvlan))
1646                 return 0;
1647
1648         r = sd_rtnl_open(&rtnl, 0);
1649         if (r < 0) {
1650                 log_error("Failed to connect to netlink: %s", strerror(-r));
1651                 return r;
1652         }
1653
1654         udev = udev_new();
1655         if (!udev) {
1656                 log_error("Failed to connect to udev.");
1657                 return -ENOMEM;
1658         }
1659
1660         STRV_FOREACH(i, arg_network_macvlan) {
1661                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1662                 _cleanup_free_ char *n = NULL;
1663                 int ifi;
1664
1665                 ifi = parse_interface(udev, *i);
1666                 if (ifi < 0)
1667                         return ifi;
1668
1669                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1670                 if (r < 0) {
1671                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1672                         return r;
1673                 }
1674
1675                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1676                 if (r < 0) {
1677                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1678                         return r;
1679                 }
1680
1681                 n = strappend("mv-", *i);
1682                 if (!n)
1683                         return log_oom();
1684
1685                 strshorten(n, IFNAMSIZ-1);
1686
1687                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1688                 if (r < 0) {
1689                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1690                         return r;
1691                 }
1692
1693                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1694                 if (r < 0) {
1695                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1696                         return r;
1697                 }
1698
1699                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1700                 if (r < 0) {
1701                         log_error("Failed to open netlink container: %s", strerror(-r));
1702                         return r;
1703                 }
1704
1705                 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "macvlan");
1706                 if (r < 0) {
1707                         log_error("Failed to append netlink kind: %s", strerror(-r));
1708                         return r;
1709                 }
1710
1711                 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1712                 if (r < 0) {
1713                         log_error("Failed to open netlink container: %s", strerror(-r));
1714                         return r;
1715                 }
1716
1717                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1718                 if (r < 0) {
1719                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1720                         return r;
1721                 }
1722
1723                 r = sd_rtnl_message_close_container(m);
1724                 if (r < 0) {
1725                         log_error("Failed to close netlink container: %s", strerror(-r));
1726                         return r;
1727                 }
1728
1729                 r = sd_rtnl_message_close_container(m);
1730                 if (r < 0) {
1731                         log_error("Failed to close netlink container: %s", strerror(-r));
1732                         return r;
1733                 }
1734
1735                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1736                 if (r < 0) {
1737                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1738                         return r;
1739                 }
1740         }
1741
1742         return 0;
1743 }
1744
/* Installs a seccomp filter that allows everything except
 * socket(AF_NETLINK, *, NETLINK_AUDIT), which is made to fail with
 * EAFNOSUPPORT.
 *
 * Audit is broken in containers, much of the userspace audit hookup will
 * fail if running inside a container. We don't care and just turn off
 * creation of audit sockets; audit userspace uses EAFNOSUPPORT as
 * indication that audit is disabled in the kernel.
 *
 * Returns 0 on success (and always when built without seccomp support),
 * a negative errno-style code otherwise. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx ctx;
        int k;

        /* Default action: let every system call through */
        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        k = seccomp_add_secondary_archs(ctx);
        if (k < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-k));
                goto release;
        }

        /* Match on the first (domain) and third (protocol) argument of socket() */
        k = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (k < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-k));
                goto release;
        }

        /* Ask libseccomp not to set NO_NEW_PRIVS when loading the filter */
        k = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (k < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-k));
                goto release;
        }

        k = seccomp_load(ctx);
        if (k < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-k));

release:
        /* The context is released on both the success and the failure path */
        seccomp_release(ctx);
        return k;
#else
        return 0;
#endif

}
1801
1802 static int setup_image(char **device_path, int *loop_nr) {
1803         struct loop_info64 info = {
1804                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1805         };
1806         _cleanup_close_ int fd = -1, control = -1, loop = -1;
1807         _cleanup_free_ char* loopdev = NULL;
1808         struct stat st;
1809         int r, nr;
1810
1811         assert(device_path);
1812         assert(loop_nr);
1813
1814         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1815         if (fd < 0) {
1816                 log_error("Failed to open %s: %m", arg_image);
1817                 return -errno;
1818         }
1819
1820         if (fstat(fd, &st) < 0) {
1821                 log_error("Failed to stat %s: %m", arg_image);
1822                 return -errno;
1823         }
1824
1825         if (S_ISBLK(st.st_mode)) {
1826                 char *p;
1827
1828                 p = strdup(arg_image);
1829                 if (!p)
1830                         return log_oom();
1831
1832                 *device_path = p;
1833
1834                 *loop_nr = -1;
1835
1836                 r = fd;
1837                 fd = -1;
1838
1839                 return r;
1840         }
1841
1842         if (!S_ISREG(st.st_mode)) {
1843                 log_error("%s is not a regular file or block device: %m", arg_image);
1844                 return -EINVAL;
1845         }
1846
1847         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1848         if (control < 0) {
1849                 log_error("Failed to open /dev/loop-control: %m");
1850                 return -errno;
1851         }
1852
1853         nr = ioctl(control, LOOP_CTL_GET_FREE);
1854         if (nr < 0) {
1855                 log_error("Failed to allocate loop device: %m");
1856                 return -errno;
1857         }
1858
1859         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1860                 return log_oom();
1861
1862         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1863         if (loop < 0) {
1864                 log_error("Failed to open loop device %s: %m", loopdev);
1865                 return -errno;
1866         }
1867
1868         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1869                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1870                 return -errno;
1871         }
1872
1873         if (arg_read_only)
1874                 info.lo_flags |= LO_FLAGS_READ_ONLY;
1875
1876         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1877                 log_error("Failed to set loopback settings on %s: %m", loopdev);
1878                 return -errno;
1879         }
1880
1881         *device_path = loopdev;
1882         loopdev = NULL;
1883
1884         *loop_nr = nr;
1885
1886         r = loop;
1887         loop = -1;
1888
1889         return r;
1890 }
1891
#ifdef HAVE_BLKID
/* Helper for dissect_image(): remembers the partition device node "node"
 * in *slot, unless a candidate with a lower or equal partition number is
 * already stored there. Returns 0, or the result of log_oom() if the node
 * name cannot be duplicated. */
static int dissect_remember_partition(
                char **slot, int *slot_nr, bool *slot_rw,
                const char *node, int nr, bool rw) {

        if (*slot && nr >= *slot_nr)
                return 0;

        *slot_nr = nr;
        *slot_rw = rw;

        free(*slot);
        *slot = strdup(node);
        if (!*slot)
                return log_oom();

        return 0;
}
#endif

/* Probes the block device open on "fd" for a GPT partition table and
 * picks the partitions to use as root, /home and /srv, identified by
 * their GPT type UUIDs (GPT_ROOT_NATIVE / GPT_ROOT_SECONDARY, GPT_HOME,
 * GPT_SRV). Partitions flagged GPT_FLAG_NO_AUTO are ignored; among
 * several candidates of one kind the lowest partition number wins.
 *
 * On success returns 0 and stores newly allocated device node paths in
 * *root_device, *home_device and *srv_device (the latter two only if such
 * a partition was found), together with whether each may be mounted
 * writable. *secondary is set to true if the root partition is of the
 * secondary architecture type. Fails with -EINVAL if no partition table,
 * no GPT or no root partition is found, and with -ENOTSUP when built
 * without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerate = NULL;
        _cleanup_udev_device_unref_ struct udev_device *disk = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *head, *entry;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist partitions;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        /* errno is cleared before each blkid call so that "failed without
         * setting errno" can be told apart from a real errno afterwards */
        errno = 0;
        r = blkid_probe_set_device(probe, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(probe, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partitions = blkid_probe_get_partitions(probe);
        if (!partitions) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        disk = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!disk)
                return log_oom();

        /* Enumerate all udev devices that have the whole disk as parent */
        enumerate = udev_enumerate_new(udev);
        if (!enumerate)
                return log_oom();

        r = udev_enumerate_add_match_parent(enumerate, disk);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(enumerate);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        head = udev_enumerate_get_list_entry(enumerate);
        udev_list_entry_foreach(entry, head) {
                _cleanup_udev_device_unref_ struct udev_device *pdev = NULL;
                const char *type_str, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition part;
                char **slot = NULL;
                int *slot_nr = NULL;
                bool *slot_rw = NULL;
                dev_t devnum;
                int partno;

                errno = 0;
                pdev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(entry));
                if (!pdev) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number, and the whole disk itself */
                devnum = udev_device_get_devnum(pdev);
                if (major(devnum) == 0)
                        continue;

                if (st.st_rdev == devnum)
                        continue;

                node = udev_device_get_devnode(pdev);
                if (!node)
                        continue;

                /* Map the udev device back to its entry in the partition table */
                part = blkid_partlist_devno_to_partition(partitions, devnum);
                if (!part)
                        continue;

                flags = blkid_partition_get_flags(part);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                partno = blkid_partition_get_partno(part);
                if (partno < 0)
                        continue;

                type_str = blkid_partition_get_type_string(part);
                if (!type_str)
                        continue;

                if (sd_id128_from_string(type_str, &type_id) < 0)
                        continue;

                /* Pick the set of variables tracking this kind of partition */
                if (sd_id128_equal(type_id, GPT_HOME)) {
                        slot = &home;
                        slot_nr = &home_nr;
                        slot_rw = &home_rw;
                } else if (sd_id128_equal(type_id, GPT_SRV)) {
                        slot = &srv;
                        slot_nr = &srv_nr;
                        slot_rw = &srv_rw;
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
                        slot = &root;
                        slot_nr = &root_nr;
                        slot_rw = &root_rw;
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
                        slot = &secondary_root;
                        slot_nr = &secondary_root_nr;
                        slot_rw = &secondary_root_rw;
                }
#endif

                /* Partition type we are not interested in */
                if (!slot)
                        continue;

                r = dissect_remember_partition(
                                slot, slot_nr, slot_rw,
                                node, partno, !(flags & GPT_FLAG_READ_ONLY));
                if (r != 0)
                        return r;
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Transfer ownership of the strings to the caller; the native root
         * partition is preferred over the secondary one. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2141
/* Mounts the block device "what" on "where" (with "directory" appended
 * if it is non-NULL), using the file system type detected by blkid.
 *
 * The mount is done with MS_NODEV, and read-only if "rw" is false or
 * arg_read_only is set. LUKS volumes are refused with -ENOTSUP.
 *
 * Returns 0 on success, a negative errno-style code on failure (logged),
 * and -ENOTSUP when built without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-partition setting */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. In both cases
                 * probing itself worked but no type could be determined. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Any other non-zero value (i.e. -1) is a real probing error */
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2207
2208 static int mount_devices(
2209                 const char *where,
2210                 const char *root_device, bool root_device_rw,
2211                 const char *home_device, bool home_device_rw,
2212                 const char *srv_device, bool srv_device_rw) {
2213         int r;
2214
2215         assert(where);
2216
2217         if (root_device) {
2218                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2219                 if (r < 0) {
2220                         log_error("Failed to mount root directory: %s", strerror(-r));
2221                         return r;
2222                 }
2223         }
2224
2225         if (home_device) {
2226                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2227                 if (r < 0) {
2228                         log_error("Failed to mount home directory: %s", strerror(-r));
2229                         return r;
2230                 }
2231         }
2232
2233         if (srv_device) {
2234                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2235                 if (r < 0) {
2236                         log_error("Failed to mount server data directory: %s", strerror(-r));
2237                         return r;
2238                 }
2239         }
2240
2241         return 0;
2242 }
2243
2244 static void loop_remove(int nr, int *image_fd) {
2245         _cleanup_close_ int control = -1;
2246
2247         if (nr < 0)
2248                 return;
2249
2250         if (image_fd && *image_fd >= 0) {
2251                 ioctl(*image_fd, LOOP_CLR_FD);
2252                 close_nointr_nofail(*image_fd);
2253                 *image_fd = -1;
2254         }
2255
2256         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2257         if (control < 0)
2258                 return;
2259
2260         ioctl(control, LOOP_CTL_REMOVE, nr);
2261 }
2262
2263 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2264         int pipe_fds[2];
2265         pid_t pid;
2266
2267         assert(database);
2268         assert(key);
2269         assert(rpid);
2270
2271         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2272                 log_error("Failed to allocate pipe: %m");
2273                 return -errno;
2274         }
2275
2276         pid = fork();
2277         if (pid < 0) {
2278                 log_error("Failed to fork getent child: %m");
2279                 return -errno;
2280         } else if (pid == 0) {
2281                 int nullfd;
2282                 char *empty_env = NULL;
2283
2284                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2285                         _exit(EXIT_FAILURE);
2286
2287                 if (pipe_fds[0] > 2)
2288                         close_nointr_nofail(pipe_fds[0]);
2289                 if (pipe_fds[1] > 2)
2290                         close_nointr_nofail(pipe_fds[1]);
2291
2292                 nullfd = open("/dev/null", O_RDWR);
2293                 if (nullfd < 0)
2294                         _exit(EXIT_FAILURE);
2295
2296                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2297                         _exit(EXIT_FAILURE);
2298
2299                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2300                         _exit(EXIT_FAILURE);
2301
2302                 if (nullfd > 2)
2303                         close_nointr_nofail(nullfd);
2304
2305                 reset_all_signal_handlers();
2306                 close_all_fds(NULL, 0);
2307
2308                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2309                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2310                 _exit(EXIT_FAILURE);
2311         }
2312
2313         close_nointr_nofail(pipe_fds[1]);
2314         pipe_fds[1] = -1;
2315
2316         *rpid = pid;
2317
2318         return pipe_fds[0];
2319 }
2320
2321 static int change_uid_gid(char **_home) {
2322
2323         _cleanup_strv_free_ char **passwd = NULL;
2324         char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2325         _cleanup_free_ uid_t *uids = NULL;
2326         _cleanup_free_ char *home = NULL;
2327         _cleanup_fclose_ FILE *f = NULL;
2328         _cleanup_close_ int fd = -1;
2329         unsigned n_uids = 0;
2330         size_t sz, l;
2331         uid_t uid;
2332         gid_t gid;
2333         pid_t pid;
2334         int r;
2335
2336         assert(_home);
2337
2338         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2339                 /* Reset everything fully to 0, just in case */
2340
2341                 if (setgroups(0, NULL) < 0) {
2342                         log_error("setgroups() failed: %m");
2343                         return -errno;
2344                 }
2345
2346                 if (setresgid(0, 0, 0) < 0) {
2347                         log_error("setregid() failed: %m");
2348                         return -errno;
2349                 }
2350
2351                 if (setresuid(0, 0, 0) < 0) {
2352                         log_error("setreuid() failed: %m");
2353                         return -errno;
2354                 }
2355
2356                 *_home = NULL;
2357                 return 0;
2358         }
2359
2360         /* First, get user credentials */
2361         fd = spawn_getent("passwd", arg_user, &pid);
2362         if (fd < 0)
2363                 return fd;
2364
2365         f = fdopen(fd, "r");
2366         if (!f)
2367                 return log_oom();
2368         fd = -1;
2369
2370         if (!fgets(line, sizeof(line), f)) {
2371
2372                 if (!ferror(f)) {
2373                         log_error("Failed to resolve user %s.", arg_user);
2374                         return -ESRCH;
2375                 }
2376
2377                 log_error("Failed to read from getent: %m");
2378                 return -errno;
2379         }
2380
2381         truncate_nl(line);
2382
2383         wait_for_terminate_and_warn("getent passwd", pid);
2384
2385         x = strchr(line, ':');
2386         if (!x) {
2387                 log_error("/etc/passwd entry has invalid user field.");
2388                 return -EIO;
2389         }
2390
2391         u = strchr(x+1, ':');
2392         if (!u) {
2393                 log_error("/etc/passwd entry has invalid password field.");
2394                 return -EIO;
2395         }
2396
2397         u++;
2398         g = strchr(u, ':');
2399         if (!g) {
2400                 log_error("/etc/passwd entry has invalid UID field.");
2401                 return -EIO;
2402         }
2403
2404         *g = 0;
2405         g++;
2406         x = strchr(g, ':');
2407         if (!x) {
2408                 log_error("/etc/passwd entry has invalid GID field.");
2409                 return -EIO;
2410         }
2411
2412         *x = 0;
2413         h = strchr(x+1, ':');
2414         if (!h) {
2415                 log_error("/etc/passwd entry has invalid GECOS field.");
2416                 return -EIO;
2417         }
2418
2419         h++;
2420         x = strchr(h, ':');
2421         if (!x) {
2422                 log_error("/etc/passwd entry has invalid home directory field.");
2423                 return -EIO;
2424         }
2425
2426         *x = 0;
2427
2428         r = parse_uid(u, &uid);
2429         if (r < 0) {
2430                 log_error("Failed to parse UID of user.");
2431                 return -EIO;
2432         }
2433
2434         r = parse_gid(g, &gid);
2435         if (r < 0) {
2436                 log_error("Failed to parse GID of user.");
2437                 return -EIO;
2438         }
2439
2440         home = strdup(h);
2441         if (!home)
2442                 return log_oom();
2443
2444         /* Second, get group memberships */
2445         fd = spawn_getent("initgroups", arg_user, &pid);
2446         if (fd < 0)
2447                 return fd;
2448
2449         fclose(f);
2450         f = fdopen(fd, "r");
2451         if (!f)
2452                 return log_oom();
2453         fd = -1;
2454
2455         if (!fgets(line, sizeof(line), f)) {
2456                 if (!ferror(f)) {
2457                         log_error("Failed to resolve user %s.", arg_user);
2458                         return -ESRCH;
2459                 }
2460
2461                 log_error("Failed to read from getent: %m");
2462                 return -errno;
2463         }
2464
2465         truncate_nl(line);
2466
2467         wait_for_terminate_and_warn("getent initgroups", pid);
2468
2469         /* Skip over the username and subsequent separator whitespace */
2470         x = line;
2471         x += strcspn(x, WHITESPACE);
2472         x += strspn(x, WHITESPACE);
2473
2474         FOREACH_WORD(w, l, x, state) {
2475                 char c[l+1];
2476
2477                 memcpy(c, w, l);
2478                 c[l] = 0;
2479
2480                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2481                         return log_oom();
2482
2483                 r = parse_uid(c, &uids[n_uids++]);
2484                 if (r < 0) {
2485                         log_error("Failed to parse group data from getent.");
2486                         return -EIO;
2487                 }
2488         }
2489
2490         r = mkdir_parents(home, 0775);
2491         if (r < 0) {
2492                 log_error("Failed to make home root directory: %s", strerror(-r));
2493                 return r;
2494         }
2495
2496         r = mkdir_safe(home, 0755, uid, gid);
2497         if (r < 0) {
2498                 log_error("Failed to make home directory: %s", strerror(-r));
2499                 return r;
2500         }
2501
2502         fchown(STDIN_FILENO, uid, gid);
2503         fchown(STDOUT_FILENO, uid, gid);
2504         fchown(STDERR_FILENO, uid, gid);
2505
2506         if (setgroups(n_uids, uids) < 0) {
2507                 log_error("Failed to set auxiliary groups: %m");
2508                 return -errno;
2509         }
2510
2511         if (setresgid(gid, gid, gid) < 0) {
2512                 log_error("setregid() failed: %m");
2513                 return -errno;
2514         }
2515
2516         if (setresuid(uid, uid, uid) < 0) {
2517                 log_error("setreuid() failed: %m");
2518                 return -errno;
2519         }
2520
2521         if (_home) {
2522                 *_home = home;
2523                 home = NULL;
2524         }
2525
2526         return 0;
2527 }
2528
2529 int main(int argc, char *argv[]) {
2530
2531         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2532         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2533         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2534         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
2535         _cleanup_fdset_free_ FDSet *fds = NULL;
2536         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2537         const char *console = NULL;
2538         char veth_name[IFNAMSIZ];
2539         bool secondary = false;
2540         pid_t pid = 0;
2541         sigset_t mask;
2542
2543         log_parse_environment();
2544         log_open();
2545
2546         k = parse_argv(argc, argv);
2547         if (k < 0)
2548                 goto finish;
2549         else if (k == 0) {
2550                 r = EXIT_SUCCESS;
2551                 goto finish;
2552         }
2553
2554         if (!arg_image) {
2555                 if (arg_directory) {
2556                         char *p;
2557
2558                         p = path_make_absolute_cwd(arg_directory);
2559                         free(arg_directory);
2560                         arg_directory = p;
2561                 } else
2562                         arg_directory = get_current_dir_name();
2563
2564                 if (!arg_directory) {
2565                         log_error("Failed to determine path, please use -D.");
2566                         goto finish;
2567                 }
2568                 path_kill_slashes(arg_directory);
2569         }
2570
2571         if (!arg_machine) {
2572                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2573                 if (!arg_machine) {
2574                         log_oom();
2575                         goto finish;
2576                 }
2577
2578                 hostname_cleanup(arg_machine, false);
2579                 if (isempty(arg_machine)) {
2580                         log_error("Failed to determine machine name automatically, please use -M.");
2581                         goto finish;
2582                 }
2583         }
2584
2585         if (geteuid() != 0) {
2586                 log_error("Need to be root.");
2587                 goto finish;
2588         }
2589
2590         if (sd_booted() <= 0) {
2591                 log_error("Not running on a systemd system.");
2592                 goto finish;
2593         }
2594
2595         log_close();
2596         n_fd_passed = sd_listen_fds(false);
2597         if (n_fd_passed > 0) {
2598                 k = fdset_new_listen_fds(&fds, false);
2599                 if (k < 0) {
2600                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2601                         goto finish;
2602                 }
2603         }
2604         fdset_close_others(fds);
2605         log_open();
2606
2607         if (arg_directory) {
2608                 if (path_equal(arg_directory, "/")) {
2609                         log_error("Spawning container on root directory not supported.");
2610                         goto finish;
2611                 }
2612
2613                 if (arg_boot) {
2614                         if (path_is_os_tree(arg_directory) <= 0) {
2615                                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2616                                 goto finish;
2617                         }
2618                 } else {
2619                         const char *p;
2620
2621                         p = strappenda(arg_directory,
2622                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2623                         if (access(p, F_OK) < 0) {
2624                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2625                                 goto finish;
2626
2627                         }
2628                 }
2629         } else {
2630                 char template[] = "/tmp/nspawn-root-XXXXXX";
2631
2632                 if (!mkdtemp(template)) {
2633                         log_error("Failed to create temporary directory: %m");
2634                         r = -errno;
2635                         goto finish;
2636                 }
2637
2638                 arg_directory = strdup(template);
2639                 if (!arg_directory) {
2640                         r = log_oom();
2641                         goto finish;
2642                 }
2643
2644                 image_fd = setup_image(&device_path, &loop_nr);
2645                 if (image_fd < 0) {
2646                         r = image_fd;
2647                         goto finish;
2648                 }
2649
2650                 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2651                 if (r < 0)
2652                         goto finish;
2653         }
2654
2655         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2656         if (master < 0) {
2657                 log_error("Failed to acquire pseudo tty: %m");
2658                 goto finish;
2659         }
2660
2661         console = ptsname(master);
2662         if (!console) {
2663                 log_error("Failed to determine tty name: %m");
2664                 goto finish;
2665         }
2666
2667         if (!arg_quiet)
2668                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
2669
2670         if (unlockpt(master) < 0) {
2671                 log_error("Failed to unlock tty: %m");
2672                 goto finish;
2673         }
2674
2675         if (access("/dev/kdbus/control", F_OK) >= 0) {
2676
2677                 if (arg_share_system) {
2678                         kdbus_domain = strdup("/dev/kdbus");
2679                         if (!kdbus_domain) {
2680                                 log_oom();
2681                                 goto finish;
2682                         }
2683                 } else {
2684                         const char *ns;
2685
2686                         ns = strappenda("machine-", arg_machine);
2687                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2688                         if (r < 0)
2689                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2690                         else
2691                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2692                 }
2693         }
2694
2695         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2696                 log_error("Failed to create kmsg socket pair: %m");
2697                 goto finish;
2698         }
2699
2700         sd_notify(0, "READY=1");
2701
2702         assert_se(sigemptyset(&mask) == 0);
2703         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2704         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2705
2706         for (;;) {
2707                 int parent_ready_fd = -1, child_ready_fd = -1;
2708                 siginfo_t status;
2709                 eventfd_t x;
2710
2711                 parent_ready_fd = eventfd(0, EFD_CLOEXEC);
2712                 if (parent_ready_fd < 0) {
2713                         log_error("Failed to create event fd: %m");
2714                         goto finish;
2715                 }
2716
2717                 child_ready_fd = eventfd(0, EFD_CLOEXEC);
2718                 if (child_ready_fd < 0) {
2719                         log_error("Failed to create event fd: %m");
2720                         goto finish;
2721                 }
2722
2723                 pid = syscall(__NR_clone,
2724                               SIGCHLD|CLONE_NEWNS|
2725                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2726                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
2727                 if (pid < 0) {
2728                         if (errno == EINVAL)
2729                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2730                         else
2731                                 log_error("clone() failed: %m");
2732
2733                         goto finish;
2734                 }
2735
2736                 if (pid == 0) {
2737                         /* child */
2738                         _cleanup_free_ char *home = NULL;
2739                         unsigned n_env = 2;
2740                         const char *envp[] = {
2741                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
2742                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2743                                 NULL, /* TERM */
2744                                 NULL, /* HOME */
2745                                 NULL, /* USER */
2746                                 NULL, /* LOGNAME */
2747                                 NULL, /* container_uuid */
2748                                 NULL, /* LISTEN_FDS */
2749                                 NULL, /* LISTEN_PID */
2750                                 NULL
2751                         };
2752                         char **env_use;
2753
2754                         envp[n_env] = strv_find_prefix(environ, "TERM=");
2755                         if (envp[n_env])
2756                                 n_env ++;
2757
2758                         close_nointr_nofail(master);
2759                         master = -1;
2760
2761                         close_nointr(STDIN_FILENO);
2762                         close_nointr(STDOUT_FILENO);
2763                         close_nointr(STDERR_FILENO);
2764
2765                         close_nointr_nofail(kmsg_socket_pair[0]);
2766                         kmsg_socket_pair[0] = -1;
2767
2768                         reset_all_signal_handlers();
2769
2770                         assert_se(sigemptyset(&mask) == 0);
2771                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2772
2773                         k = open_terminal(console, O_RDWR);
2774                         if (k != STDIN_FILENO) {
2775                                 if (k >= 0) {
2776                                         close_nointr_nofail(k);
2777                                         k = -EINVAL;
2778                                 }
2779
2780                                 log_error("Failed to open console: %s", strerror(-k));
2781                                 goto child_fail;
2782                         }
2783
2784                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2785                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2786                                 log_error("Failed to duplicate console: %m");
2787                                 goto child_fail;
2788                         }
2789
2790                         if (setsid() < 0) {
2791                                 log_error("setsid() failed: %m");
2792                                 goto child_fail;
2793                         }
2794
2795                         if (reset_audit_loginuid() < 0)
2796                                 goto child_fail;
2797
2798                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2799                                 log_error("PR_SET_PDEATHSIG failed: %m");
2800                                 goto child_fail;
2801                         }
2802
2803                         /* Mark everything as slave, so that we still
2804                          * receive mounts from the real root, but don't
2805                          * propagate mounts to the real root. */
2806                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2807                                 log_error("MS_SLAVE|MS_REC failed: %m");
2808                                 goto child_fail;
2809                         }
2810
2811                         if (mount_devices(arg_directory,
2812                                           root_device, root_device_rw,
2813                                           home_device, home_device_rw,
2814                                           srv_device, srv_device_rw) < 0)
2815                                 goto child_fail;
2816
2817                         /* Turn directory into bind mount */
2818                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2819                                 log_error("Failed to make bind mount.");
2820                                 goto child_fail;
2821                         }
2822
2823                         if (arg_read_only)
2824                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2825                                         log_error("Failed to make read-only.");
2826                                         goto child_fail;
2827                                 }
2828
2829                         if (mount_all(arg_directory) < 0)
2830                                 goto child_fail;
2831
2832                         if (copy_devnodes(arg_directory) < 0)
2833                                 goto child_fail;
2834
2835                         if (setup_ptmx(arg_directory) < 0)
2836                                 goto child_fail;
2837
2838                         dev_setup(arg_directory);
2839
2840                         if (audit_still_doesnt_work_in_containers() < 0)
2841                                 goto child_fail;
2842
2843                         if (setup_dev_console(arg_directory, console) < 0)
2844                                 goto child_fail;
2845
2846                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2847                                 goto child_fail;
2848
2849                         close_nointr_nofail(kmsg_socket_pair[1]);
2850                         kmsg_socket_pair[1] = -1;
2851
2852                         if (setup_boot_id(arg_directory) < 0)
2853                                 goto child_fail;
2854
2855                         if (setup_timezone(arg_directory) < 0)
2856                                 goto child_fail;
2857
2858                         if (setup_resolv_conf(arg_directory) < 0)
2859                                 goto child_fail;
2860
2861                         if (setup_journal(arg_directory) < 0)
2862                                 goto child_fail;
2863
2864                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
2865                                 goto child_fail;
2866
2867                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2868                                 goto child_fail;
2869
2870                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
2871                                 goto child_fail;
2872
2873                         /* Tell the parent that we are ready, and that
2874                          * it can cgroupify us to that we lack access
2875                          * to certain devices and resources. */
2876                         eventfd_write(child_ready_fd, 1);
2877                         close_nointr_nofail(child_ready_fd);
2878                         child_ready_fd = -1;
2879
2880                         if (chdir(arg_directory) < 0) {
2881                                 log_error("chdir(%s) failed: %m", arg_directory);
2882                                 goto child_fail;
2883                         }
2884
2885                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2886                                 log_error("mount(MS_MOVE) failed: %m");
2887                                 goto child_fail;
2888                         }
2889
2890                         if (chroot(".") < 0) {
2891                                 log_error("chroot() failed: %m");
2892                                 goto child_fail;
2893                         }
2894
2895                         if (chdir("/") < 0) {
2896                                 log_error("chdir() failed: %m");
2897                                 goto child_fail;
2898                         }
2899
2900                         umask(0022);
2901
2902                         if (arg_private_network)
2903                                 loopback_setup();
2904
2905                         if (drop_capabilities() < 0) {
2906                                 log_error("drop_capabilities() failed: %m");
2907                                 goto child_fail;
2908                         }
2909
2910                         r = change_uid_gid(&home);
2911                         if (r < 0)
2912                                 goto child_fail;
2913
2914                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2915                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2916                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
2917                                 log_oom();
2918                                 goto child_fail;
2919                         }
2920
2921                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2922                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
2923                                         log_oom();
2924                                         goto child_fail;
2925                                 }
2926                         }
2927
2928                         if (fdset_size(fds) > 0) {
2929                                 k = fdset_cloexec(fds, false);
2930                                 if (k < 0) {
2931                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
2932                                         goto child_fail;
2933                                 }
2934
2935                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
2936                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
2937                                         log_oom();
2938                                         goto child_fail;
2939                                 }
2940                         }
2941
2942                         setup_hostname();
2943
2944                         if (arg_personality != 0xffffffffLU) {
2945                                 if (personality(arg_personality) < 0) {
2946                                         log_error("personality() failed: %m");
2947                                         goto child_fail;
2948                                 }
2949                         } else if (secondary) {
2950                                 if (personality(PER_LINUX32) < 0) {
2951                                         log_error("personality() failed: %m");
2952                                         goto child_fail;
2953                                 }
2954                         }
2955
2956 #ifdef HAVE_SELINUX
2957                         if (arg_selinux_context)
2958                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
2959                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
2960                                         goto child_fail;
2961                                 }
2962 #endif
2963
2964                         if (!strv_isempty(arg_setenv)) {
2965                                 char **n;
2966
2967                                 n = strv_env_merge(2, envp, arg_setenv);
2968                                 if (!n) {
2969                                         log_oom();
2970                                         goto child_fail;
2971                                 }
2972
2973                                 env_use = n;
2974                         } else
2975                                 env_use = (char**) envp;
2976
2977                         /* Wait until the parent is ready with the setup, too... */
2978                         eventfd_read(parent_ready_fd, &x);
2979                         close_nointr_nofail(parent_ready_fd);
2980                         parent_ready_fd = -1;
2981
2982                         if (arg_boot) {
2983                                 char **a;
2984                                 size_t l;
2985
2986                                 /* Automatically search for the init system */
2987
2988                                 l = 1 + argc - optind;
2989                                 a = newa(char*, l + 1);
2990                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
2991
2992                                 a[0] = (char*) "/usr/lib/systemd/systemd";
2993                                 execve(a[0], a, env_use);
2994
2995                                 a[0] = (char*) "/lib/systemd/systemd";
2996                                 execve(a[0], a, env_use);
2997
2998                                 a[0] = (char*) "/sbin/init";
2999                                 execve(a[0], a, env_use);
3000                         } else if (argc > optind)
3001                                 execvpe(argv[optind], argv + optind, env_use);
3002                         else {
3003                                 chdir(home ? home : "/root");
3004                                 execle("/bin/bash", "-bash", NULL, env_use);
3005                                 execle("/bin/sh", "-sh", NULL, env_use);
3006                         }
3007
3008                         log_error("execv() failed: %m");
3009
3010                 child_fail:
3011                         _exit(EXIT_FAILURE);
3012                 }
3013
3014                 fdset_free(fds);
3015                 fds = NULL;
3016
3017                 /* Wait until the child reported that it is ready with
3018                  * all it needs to do with priviliges. After we got
3019                  * the notification we can make the process join its
3020                  * cgroup which might limit what it can do */
3021                 eventfd_read(child_ready_fd, &x);
3022
3023                 r = register_machine(pid);
3024                 if (r < 0)
3025                         goto finish;
3026
3027                 r = move_network_interfaces(pid);
3028                 if (r < 0)
3029                         goto finish;
3030
3031                 r = setup_veth(pid, veth_name);
3032                 if (r < 0)
3033                         goto finish;
3034
3035                 r = setup_bridge(veth_name);
3036                 if (r < 0)
3037                         goto finish;
3038
3039                 r = setup_macvlan(pid);
3040                 if (r < 0)
3041                         goto finish;
3042
3043                 /* Notify the child that the parent is ready with all
3044                  * its setup, and thtat the child can now hand over
3045                  * control to the code to run inside the container. */
3046                 eventfd_write(parent_ready_fd, 1);
3047
3048                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3049                 if (k < 0) {
3050                         r = EXIT_FAILURE;
3051                         break;
3052                 }
3053
3054                 if (!arg_quiet)
3055                         putc('\n', stdout);
3056
3057                 /* Kill if it is not dead yet anyway */
3058                 terminate_machine(pid);
3059
3060                 /* Redundant, but better safe than sorry */
3061                 kill(pid, SIGKILL);
3062
3063                 k = wait_for_terminate(pid, &status);
3064                 pid = 0;
3065
3066                 if (k < 0) {
3067                         r = EXIT_FAILURE;
3068                         break;
3069                 }
3070
3071                 if (status.si_code == CLD_EXITED) {
3072                         r = status.si_status;
3073                         if (status.si_status != 0) {
3074                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
3075                                 break;
3076                         }
3077
3078                         if (!arg_quiet)
3079                                 log_debug("Container %s exited successfully.", arg_machine);
3080                         break;
3081                 } else if (status.si_code == CLD_KILLED &&
3082                            status.si_status == SIGINT) {
3083
3084                         if (!arg_quiet)
3085                                 log_info("Container %s has been shut down.", arg_machine);
3086                         r = 0;
3087                         break;
3088                 } else if (status.si_code == CLD_KILLED &&
3089                            status.si_status == SIGHUP) {
3090
3091                         if (!arg_quiet)
3092                                 log_info("Container %s is being rebooted.", arg_machine);
3093                         continue;
3094                 } else if (status.si_code == CLD_KILLED ||
3095                            status.si_code == CLD_DUMPED) {
3096
3097                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3098                         r = EXIT_FAILURE;
3099                         break;
3100                 } else {
3101                         log_error("Container %s failed due to unknown reason.", arg_machine);
3102                         r = EXIT_FAILURE;
3103                         break;
3104                 }
3105         }
3106
3107 finish:
3108         loop_remove(loop_nr, &image_fd);
3109
3110         if (pid > 0)
3111                 kill(pid, SIGKILL);
3112
3113         free(arg_directory);
3114         free(arg_machine);
3115         free(arg_user);
3116         strv_free(arg_setenv);
3117         strv_free(arg_network_interfaces);
3118         strv_free(arg_network_macvlan);
3119         strv_free(arg_bind);
3120         strv_free(arg_bind_ro);
3121
3122         return r;
3123 }