chiark / gitweb /
867cf1926783ff07c9b7467e5ab4082931d5446e
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "eventfd-util.h"
88 #include "blkid-util.h"
89 #include "gpt.h"
90 #include "siphash24.h"
91 #include "copy.h"
92
93 #ifdef HAVE_SECCOMP
94 #include "seccomp-util.h"
95 #endif
96
/* Distinguishes a container that terminated from one that rebooted.
 * NOTE(review): the code consuming this enum is outside this part of
 * the file; confirm exact semantics there. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
101
/* Journal linking mode, selected with --link-journal=MODE (or -j) in
 * parse_argv(). */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto; also the built-in default */
        LINK_HOST  = 2,  /* --link-journal=host */
        LINK_GUEST = 3   /* --link-journal=guest, and what -j selects */
} LinkJournal;
108
109 static char *arg_directory = NULL;
110 static char *arg_user = NULL;
111 static sd_id128_t arg_uuid = {};
112 static char *arg_machine = NULL;
113 static const char *arg_selinux_context = NULL;
114 static const char *arg_selinux_apifs_context = NULL;
115 static const char *arg_slice = NULL;
116 static bool arg_private_network = false;
117 static bool arg_read_only = false;
118 static bool arg_boot = false;
119 static LinkJournal arg_link_journal = LINK_AUTO;
120 static uint64_t arg_retain =
121         (1ULL << CAP_CHOWN) |
122         (1ULL << CAP_DAC_OVERRIDE) |
123         (1ULL << CAP_DAC_READ_SEARCH) |
124         (1ULL << CAP_FOWNER) |
125         (1ULL << CAP_FSETID) |
126         (1ULL << CAP_IPC_OWNER) |
127         (1ULL << CAP_KILL) |
128         (1ULL << CAP_LEASE) |
129         (1ULL << CAP_LINUX_IMMUTABLE) |
130         (1ULL << CAP_NET_BIND_SERVICE) |
131         (1ULL << CAP_NET_BROADCAST) |
132         (1ULL << CAP_NET_RAW) |
133         (1ULL << CAP_SETGID) |
134         (1ULL << CAP_SETFCAP) |
135         (1ULL << CAP_SETPCAP) |
136         (1ULL << CAP_SETUID) |
137         (1ULL << CAP_SYS_ADMIN) |
138         (1ULL << CAP_SYS_CHROOT) |
139         (1ULL << CAP_SYS_NICE) |
140         (1ULL << CAP_SYS_PTRACE) |
141         (1ULL << CAP_SYS_TTY_CONFIG) |
142         (1ULL << CAP_SYS_RESOURCE) |
143         (1ULL << CAP_SYS_BOOT) |
144         (1ULL << CAP_AUDIT_WRITE) |
145         (1ULL << CAP_AUDIT_CONTROL) |
146         (1ULL << CAP_MKNOD);
147 static char **arg_bind = NULL;
148 static char **arg_bind_ro = NULL;
149 static char **arg_setenv = NULL;
150 static bool arg_quiet = false;
151 static bool arg_share_system = false;
152 static bool arg_register = true;
153 static bool arg_keep_unit = false;
154 static char **arg_network_interfaces = NULL;
155 static char **arg_network_macvlan = NULL;
156 static bool arg_network_veth = false;
157 static const char *arg_network_bridge = NULL;
158 static unsigned long arg_personality = 0xffffffffLU;
159 static const char *arg_image = NULL;
160
/* Prints the usage summary for all command line options to stdout.
 * The program name is taken from program_invocation_short_name.
 * Always returns 0. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "  -i --image=PATH           File system device or image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               /* parse_argv() maps -j to LINK_GUEST, so advertise "guest" here. */
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               /* --personality= is accepted by parse_argv() and therefore listed too. */
               "     --personality=ARCH     Pick personality for this container\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n",
               program_invocation_short_name);

        return 0;
}
212
213 static int parse_argv(int argc, char *argv[]) {
214
215         enum {
216                 ARG_VERSION = 0x100,
217                 ARG_PRIVATE_NETWORK,
218                 ARG_UUID,
219                 ARG_READ_ONLY,
220                 ARG_CAPABILITY,
221                 ARG_DROP_CAPABILITY,
222                 ARG_LINK_JOURNAL,
223                 ARG_BIND,
224                 ARG_BIND_RO,
225                 ARG_SETENV,
226                 ARG_SHARE_SYSTEM,
227                 ARG_REGISTER,
228                 ARG_KEEP_UNIT,
229                 ARG_NETWORK_INTERFACE,
230                 ARG_NETWORK_MACVLAN,
231                 ARG_NETWORK_VETH,
232                 ARG_NETWORK_BRIDGE,
233                 ARG_PERSONALITY,
234         };
235
236         static const struct option options[] = {
237                 { "help",                  no_argument,       NULL, 'h'                   },
238                 { "version",               no_argument,       NULL, ARG_VERSION           },
239                 { "directory",             required_argument, NULL, 'D'                   },
240                 { "user",                  required_argument, NULL, 'u'                   },
241                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
242                 { "boot",                  no_argument,       NULL, 'b'                   },
243                 { "uuid",                  required_argument, NULL, ARG_UUID              },
244                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
245                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
246                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
247                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
248                 { "bind",                  required_argument, NULL, ARG_BIND              },
249                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
250                 { "machine",               required_argument, NULL, 'M'                   },
251                 { "slice",                 required_argument, NULL, 'S'                   },
252                 { "setenv",                required_argument, NULL, ARG_SETENV            },
253                 { "selinux-context",       required_argument, NULL, 'Z'                   },
254                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
255                 { "quiet",                 no_argument,       NULL, 'q'                   },
256                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
257                 { "register",              required_argument, NULL, ARG_REGISTER          },
258                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
259                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
260                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
261                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
262                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
263                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
264                 { "image",                 required_argument, NULL, 'i'                   },
265                 {}
266         };
267
268         int c, r;
269         uint64_t plus = 0, minus = 0;
270
271         assert(argc >= 0);
272         assert(argv);
273
274         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
275
276                 switch (c) {
277
278                 case 'h':
279                         return help();
280
281                 case ARG_VERSION:
282                         puts(PACKAGE_STRING);
283                         puts(SYSTEMD_FEATURES);
284                         return 0;
285
286                 case 'D':
287                         free(arg_directory);
288                         arg_directory = canonicalize_file_name(optarg);
289                         if (!arg_directory) {
290                                 log_error("Invalid root directory: %m");
291                                 return -ENOMEM;
292                         }
293
294                         break;
295
296                 case 'i':
297                         arg_image = optarg;
298                         break;
299
300                 case 'u':
301                         free(arg_user);
302                         arg_user = strdup(optarg);
303                         if (!arg_user)
304                                 return log_oom();
305
306                         break;
307
308                 case ARG_NETWORK_BRIDGE:
309                         arg_network_bridge = optarg;
310
311                         /* fall through */
312
313                 case ARG_NETWORK_VETH:
314                         arg_network_veth = true;
315                         arg_private_network = true;
316                         break;
317
318                 case ARG_NETWORK_INTERFACE:
319                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
320                                 return log_oom();
321
322                         arg_private_network = true;
323                         break;
324
325                 case ARG_NETWORK_MACVLAN:
326                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
327                                 return log_oom();
328
329                         /* fall through */
330
331                 case ARG_PRIVATE_NETWORK:
332                         arg_private_network = true;
333                         break;
334
335                 case 'b':
336                         arg_boot = true;
337                         break;
338
339                 case ARG_UUID:
340                         r = sd_id128_from_string(optarg, &arg_uuid);
341                         if (r < 0) {
342                                 log_error("Invalid UUID: %s", optarg);
343                                 return r;
344                         }
345                         break;
346
347                 case 'S':
348                         arg_slice = optarg;
349                         break;
350
351                 case 'M':
352                         if (isempty(optarg)) {
353                                 free(arg_machine);
354                                 arg_machine = NULL;
355                         } else {
356
357                                 if (!hostname_is_valid(optarg)) {
358                                         log_error("Invalid machine name: %s", optarg);
359                                         return -EINVAL;
360                                 }
361
362                                 free(arg_machine);
363                                 arg_machine = strdup(optarg);
364                                 if (!arg_machine)
365                                         return log_oom();
366
367                                 break;
368                         }
369
370                 case 'Z':
371                         arg_selinux_context = optarg;
372                         break;
373
374                 case 'L':
375                         arg_selinux_apifs_context = optarg;
376                         break;
377
378                 case ARG_READ_ONLY:
379                         arg_read_only = true;
380                         break;
381
382                 case ARG_CAPABILITY:
383                 case ARG_DROP_CAPABILITY: {
384                         char *state, *word;
385                         size_t length;
386
387                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
388                                 _cleanup_free_ char *t;
389                                 cap_value_t cap;
390
391                                 t = strndup(word, length);
392                                 if (!t)
393                                         return log_oom();
394
395                                 if (streq(t, "all")) {
396                                         if (c == ARG_CAPABILITY)
397                                                 plus = (uint64_t) -1;
398                                         else
399                                                 minus = (uint64_t) -1;
400                                 } else {
401                                         if (cap_from_name(t, &cap) < 0) {
402                                                 log_error("Failed to parse capability %s.", t);
403                                                 return -EINVAL;
404                                         }
405
406                                         if (c == ARG_CAPABILITY)
407                                                 plus |= 1ULL << (uint64_t) cap;
408                                         else
409                                                 minus |= 1ULL << (uint64_t) cap;
410                                 }
411                         }
412
413                         break;
414                 }
415
416                 case 'j':
417                         arg_link_journal = LINK_GUEST;
418                         break;
419
420                 case ARG_LINK_JOURNAL:
421                         if (streq(optarg, "auto"))
422                                 arg_link_journal = LINK_AUTO;
423                         else if (streq(optarg, "no"))
424                                 arg_link_journal = LINK_NO;
425                         else if (streq(optarg, "guest"))
426                                 arg_link_journal = LINK_GUEST;
427                         else if (streq(optarg, "host"))
428                                 arg_link_journal = LINK_HOST;
429                         else {
430                                 log_error("Failed to parse link journal mode %s", optarg);
431                                 return -EINVAL;
432                         }
433
434                         break;
435
436                 case ARG_BIND:
437                 case ARG_BIND_RO: {
438                         _cleanup_free_ char *a = NULL, *b = NULL;
439                         char *e;
440                         char ***x;
441
442                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
443
444                         e = strchr(optarg, ':');
445                         if (e) {
446                                 a = strndup(optarg, e - optarg);
447                                 b = strdup(e + 1);
448                         } else {
449                                 a = strdup(optarg);
450                                 b = strdup(optarg);
451                         }
452
453                         if (!a || !b)
454                                 return log_oom();
455
456                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
457                                 log_error("Invalid bind mount specification: %s", optarg);
458                                 return -EINVAL;
459                         }
460
461                         r = strv_extend(x, a);
462                         if (r < 0)
463                                 return log_oom();
464
465                         r = strv_extend(x, b);
466                         if (r < 0)
467                                 return log_oom();
468
469                         break;
470                 }
471
472                 case ARG_SETENV: {
473                         char **n;
474
475                         if (!env_assignment_is_valid(optarg)) {
476                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
477                                 return -EINVAL;
478                         }
479
480                         n = strv_env_set(arg_setenv, optarg);
481                         if (!n)
482                                 return log_oom();
483
484                         strv_free(arg_setenv);
485                         arg_setenv = n;
486                         break;
487                 }
488
489                 case 'q':
490                         arg_quiet = true;
491                         break;
492
493                 case ARG_SHARE_SYSTEM:
494                         arg_share_system = true;
495                         break;
496
497                 case ARG_REGISTER:
498                         r = parse_boolean(optarg);
499                         if (r < 0) {
500                                 log_error("Failed to parse --register= argument: %s", optarg);
501                                 return r;
502                         }
503
504                         arg_register = r;
505                         break;
506
507                 case ARG_KEEP_UNIT:
508                         arg_keep_unit = true;
509                         break;
510
511                 case ARG_PERSONALITY:
512
513                         arg_personality = personality_from_string(optarg);
514                         if (arg_personality == 0xffffffffLU) {
515                                 log_error("Unknown or unsupported personality '%s'.", optarg);
516                                 return -EINVAL;
517                         }
518
519                         break;
520
521                 case '?':
522                         return -EINVAL;
523
524                 default:
525                         assert_not_reached("Unhandled option");
526                 }
527         }
528
529         if (arg_share_system)
530                 arg_register = false;
531
532         if (arg_boot && arg_share_system) {
533                 log_error("--boot and --share-system may not be combined.");
534                 return -EINVAL;
535         }
536
537         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
538                 log_error("--keep-unit may not be used when invoked from a user session.");
539                 return -EINVAL;
540         }
541
542         if (arg_directory && arg_image) {
543                 log_error("--directory= and --image= may not be combined.");
544                 return -EINVAL;
545         }
546
547         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
548
549         return 1;
550 }
551
552 static int mount_all(const char *dest) {
553
554         typedef struct MountPoint {
555                 const char *what;
556                 const char *where;
557                 const char *type;
558                 const char *options;
559                 unsigned long flags;
560                 bool fatal;
561         } MountPoint;
562
563         static const MountPoint mount_table[] = {
564                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
565                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
566                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
567                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
568                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
569                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
570                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
571                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
572 #ifdef HAVE_SELINUX
573                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
574                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
575 #endif
576         };
577
578         unsigned k;
579         int r = 0;
580
581         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
582                 _cleanup_free_ char *where = NULL;
583 #ifdef HAVE_SELINUX
584                 _cleanup_free_ char *options = NULL;
585 #endif
586                 const char *o;
587                 int t;
588
589                 where = strjoin(dest, "/", mount_table[k].where, NULL);
590                 if (!where)
591                         return log_oom();
592
593                 t = path_is_mount_point(where, true);
594                 if (t < 0) {
595                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
596
597                         if (r == 0)
598                                 r = t;
599
600                         continue;
601                 }
602
603                 /* Skip this entry if it is not a remount. */
604                 if (mount_table[k].what && t > 0)
605                         continue;
606
607                 mkdir_p(where, 0755);
608
609 #ifdef HAVE_SELINUX
610                 if (arg_selinux_apifs_context &&
611                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
612                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
613                         if (!options)
614                                 return log_oom();
615
616                         o = options;
617                 } else
618 #endif
619                         o = mount_table[k].options;
620
621
622                 if (mount(mount_table[k].what,
623                           where,
624                           mount_table[k].type,
625                           mount_table[k].flags,
626                           o) < 0 &&
627                     mount_table[k].fatal) {
628
629                         log_error("mount(%s) failed: %m", where);
630
631                         if (r == 0)
632                                 r = -errno;
633                 }
634         }
635
636         return r;
637 }
638
639 static int mount_binds(const char *dest, char **l, bool ro) {
640         char **x, **y;
641
642         STRV_FOREACH_PAIR(x, y, l) {
643                 char *where;
644                 struct stat source_st, dest_st;
645                 int r;
646
647                 if (stat(*x, &source_st) < 0) {
648                         log_error("Failed to stat %s: %m", *x);
649                         return -errno;
650                 }
651
652                 where = strappenda(dest, *y);
653                 r = stat(where, &dest_st);
654                 if (r == 0) {
655                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
656                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
657                                                 *x, where);
658                                 return -EINVAL;
659                         }
660                 } else if (errno == ENOENT) {
661                         r = mkdir_parents_label(where, 0755);
662                         if (r < 0) {
663                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
664                                 return r;
665                         }
666                 } else {
667                         log_error("Failed to bind mount %s: %m", *x);
668                         return -errno;
669                 }
670                 /* Create the mount point, but be conservative -- refuse to create block
671                 * and char devices. */
672                 if (S_ISDIR(source_st.st_mode))
673                         mkdir_label(where, 0755);
674                 else if (S_ISFIFO(source_st.st_mode))
675                         mkfifo(where, 0644);
676                 else if (S_ISSOCK(source_st.st_mode))
677                         mknod(where, 0644 | S_IFSOCK, 0);
678                 else if (S_ISREG(source_st.st_mode))
679                         touch(where);
680                 else {
681                         log_error("Refusing to create mountpoint for file: %s", *x);
682                         return -ENOTSUP;
683                 }
684
685                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
686                         log_error("mount(%s) failed: %m", where);
687                         return -errno;
688                 }
689
690                 if (ro) {
691                         r = bind_remount_recursive(where, true);
692                         if (r < 0) {
693                                 log_error("Read-Only bind mount failed: %s", strerror(-r));
694                                 return r;
695                         }
696                 }
697         }
698
699         return 0;
700 }
701
702 static int setup_timezone(const char *dest) {
703         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
704         char *z, *y;
705         int r;
706
707         assert(dest);
708
709         /* Fix the timezone, if possible */
710         r = readlink_malloc("/etc/localtime", &p);
711         if (r < 0) {
712                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
713                 return 0;
714         }
715
716         z = path_startswith(p, "../usr/share/zoneinfo/");
717         if (!z)
718                 z = path_startswith(p, "/usr/share/zoneinfo/");
719         if (!z) {
720                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
721                 return 0;
722         }
723
724         where = strappend(dest, "/etc/localtime");
725         if (!where)
726                 return log_oom();
727
728         r = readlink_malloc(where, &q);
729         if (r >= 0) {
730                 y = path_startswith(q, "../usr/share/zoneinfo/");
731                 if (!y)
732                         y = path_startswith(q, "/usr/share/zoneinfo/");
733
734
735                 /* Already pointing to the right place? Then do nothing .. */
736                 if (y && streq(y, z))
737                         return 0;
738         }
739
740         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
741         if (!check)
742                 return log_oom();
743
744         if (access(check, F_OK) < 0) {
745                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
746                 return 0;
747         }
748
749         what = strappend("../usr/share/zoneinfo/", z);
750         if (!what)
751                 return log_oom();
752
753         unlink(where);
754         if (symlink(what, where) < 0) {
755                 log_error("Failed to correct timezone of container: %m");
756                 return 0;
757         }
758
759         return 0;
760 }
761
762 static int setup_resolv_conf(const char *dest) {
763         char _cleanup_free_ *where = NULL;
764
765         assert(dest);
766
767         if (arg_private_network)
768                 return 0;
769
770         /* Fix resolv.conf, if possible */
771         where = strappend(dest, "/etc/resolv.conf");
772         if (!where)
773                 return log_oom();
774
775         /* We don't really care for the results of this really. If it
776          * fails, it fails, but meh... */
777         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
778
779         return 0;
780 }
781
782 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
783
784         snprintf(s, 37,
785                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
786                  SD_ID128_FORMAT_VAL(id));
787
788         return s;
789 }
790
791 static int setup_boot_id(const char *dest) {
792         _cleanup_free_ char *from = NULL, *to = NULL;
793         sd_id128_t rnd = {};
794         char as_uuid[37];
795         int r;
796
797         assert(dest);
798
799         if (arg_share_system)
800                 return 0;
801
802         /* Generate a new randomized boot ID, so that each boot-up of
803          * the container gets a new one */
804
805         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
806         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
807         if (!from || !to)
808                 return log_oom();
809
810         r = sd_id128_randomize(&rnd);
811         if (r < 0) {
812                 log_error("Failed to generate random boot id: %s", strerror(-r));
813                 return r;
814         }
815
816         id128_format_as_uuid(rnd, as_uuid);
817
818         r = write_string_file(from, as_uuid);
819         if (r < 0) {
820                 log_error("Failed to write boot id: %s", strerror(-r));
821                 return r;
822         }
823
824         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
825                 log_error("Failed to bind mount boot id: %m");
826                 r = -errno;
827         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
828                 log_warning("Failed to make boot id read-only: %m");
829
830         unlink(from);
831         return r;
832 }
833
834 static int copy_devnodes(const char *dest) {
835
836         static const char devnodes[] =
837                 "null\0"
838                 "zero\0"
839                 "full\0"
840                 "random\0"
841                 "urandom\0"
842                 "tty\0";
843
844         const char *d;
845         int r = 0;
846         _cleanup_umask_ mode_t u;
847
848         assert(dest);
849
850         u = umask(0000);
851
852         NULSTR_FOREACH(d, devnodes) {
853                 _cleanup_free_ char *from = NULL, *to = NULL;
854                 struct stat st;
855
856                 from = strappend("/dev/", d);
857                 to = strjoin(dest, "/dev/", d, NULL);
858                 if (!from || !to)
859                         return log_oom();
860
861                 if (stat(from, &st) < 0) {
862
863                         if (errno != ENOENT) {
864                                 log_error("Failed to stat %s: %m", from);
865                                 return -errno;
866                         }
867
868                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
869
870                         log_error("%s is not a char or block device, cannot copy", from);
871                         return -EIO;
872
873                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
874
875                         log_error("mknod(%s) failed: %m", dest);
876                         return  -errno;
877                 }
878         }
879
880         return r;
881 }
882
883 static int setup_ptmx(const char *dest) {
884         _cleanup_free_ char *p = NULL;
885
886         p = strappend(dest, "/dev/ptmx");
887         if (!p)
888                 return log_oom();
889
890         if (symlink("pts/ptmx", p) < 0) {
891                 log_error("Failed to create /dev/ptmx symlink: %m");
892                 return -errno;
893         }
894
895         return 0;
896 }
897
898 static int setup_dev_console(const char *dest, const char *console) {
899         _cleanup_umask_ mode_t u;
900         const char *to;
901         struct stat st;
902         int r;
903
904         assert(dest);
905         assert(console);
906
907         u = umask(0000);
908
909         if (stat("/dev/null", &st) < 0) {
910                 log_error("Failed to stat /dev/null: %m");
911                 return -errno;
912         }
913
914         r = chmod_and_chown(console, 0600, 0, 0);
915         if (r < 0) {
916                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
917                 return r;
918         }
919
920         /* We need to bind mount the right tty to /dev/console since
921          * ptys can only exist on pts file systems. To have something
922          * to bind mount things on we create a device node first, and
923          * use /dev/null for that since we the cgroups device policy
924          * allows us to create that freely, while we cannot create
925          * /dev/console. (Note that the major minor doesn't actually
926          * matter here, since we mount it over anyway). */
927
928         to = strappenda(dest, "/dev/console");
929         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
930                 log_error("mknod() for /dev/console failed: %m");
931                 return -errno;
932         }
933
934         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
935                 log_error("Bind mount for /dev/console failed: %m");
936                 return -errno;
937         }
938
939         return 0;
940 }
941
942 static int setup_kmsg(const char *dest, int kmsg_socket) {
943         _cleanup_free_ char *from = NULL, *to = NULL;
944         int r, fd, k;
945         _cleanup_umask_ mode_t u;
946         union {
947                 struct cmsghdr cmsghdr;
948                 uint8_t buf[CMSG_SPACE(sizeof(int))];
949         } control = {};
950         struct msghdr mh = {
951                 .msg_control = &control,
952                 .msg_controllen = sizeof(control),
953         };
954         struct cmsghdr *cmsg;
955
956         assert(dest);
957         assert(kmsg_socket >= 0);
958
959         u = umask(0000);
960
961         /* We create the kmsg FIFO as /dev/kmsg, but immediately
962          * delete it after bind mounting it to /proc/kmsg. While FIFOs
963          * on the reading side behave very similar to /proc/kmsg,
964          * their writing side behaves differently from /dev/kmsg in
965          * that writing blocks when nothing is reading. In order to
966          * avoid any problems with containers deadlocking due to this
967          * we simply make /dev/kmsg unavailable to the container. */
968         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
969             asprintf(&to, "%s/proc/kmsg", dest) < 0)
970                 return log_oom();
971
972         if (mkfifo(from, 0600) < 0) {
973                 log_error("mkfifo() for /dev/kmsg failed: %m");
974                 return -errno;
975         }
976
977         r = chmod_and_chown(from, 0600, 0, 0);
978         if (r < 0) {
979                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
980                 return r;
981         }
982
983         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
984                 log_error("Bind mount for /proc/kmsg failed: %m");
985                 return -errno;
986         }
987
988         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
989         if (fd < 0) {
990                 log_error("Failed to open fifo: %m");
991                 return -errno;
992         }
993
994         cmsg = CMSG_FIRSTHDR(&mh);
995         cmsg->cmsg_level = SOL_SOCKET;
996         cmsg->cmsg_type = SCM_RIGHTS;
997         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
998         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
999
1000         mh.msg_controllen = cmsg->cmsg_len;
1001
1002         /* Store away the fd in the socket, so that it stays open as
1003          * long as we run the child */
1004         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1005         safe_close(fd);
1006
1007         if (k < 0) {
1008                 log_error("Failed to send FIFO fd: %m");
1009                 return -errno;
1010         }
1011
1012         /* And now make the FIFO unavailable as /dev/kmsg... */
1013         unlink(from);
1014         return 0;
1015 }
1016
1017 static int setup_hostname(void) {
1018
1019         if (arg_share_system)
1020                 return 0;
1021
1022         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1023                 return -errno;
1024
1025         return 0;
1026 }
1027
1028 static int setup_journal(const char *directory) {
1029         sd_id128_t machine_id, this_id;
1030         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1031         char *id;
1032         int r;
1033
1034         p = strappend(directory, "/etc/machine-id");
1035         if (!p)
1036                 return log_oom();
1037
1038         r = read_one_line_file(p, &b);
1039         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1040                 return 0;
1041         else if (r < 0) {
1042                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1043                 return r;
1044         }
1045
1046         id = strstrip(b);
1047         if (isempty(id) && arg_link_journal == LINK_AUTO)
1048                 return 0;
1049
1050         /* Verify validity */
1051         r = sd_id128_from_string(id, &machine_id);
1052         if (r < 0) {
1053                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1054                 return r;
1055         }
1056
1057         r = sd_id128_get_machine(&this_id);
1058         if (r < 0) {
1059                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1060                 return r;
1061         }
1062
1063         if (sd_id128_equal(machine_id, this_id)) {
1064                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1065                          "Host and machine ids are equal (%s): refusing to link journals", id);
1066                 if (arg_link_journal == LINK_AUTO)
1067                         return 0;
1068                 return
1069                         -EEXIST;
1070         }
1071
1072         if (arg_link_journal == LINK_NO)
1073                 return 0;
1074
1075         free(p);
1076         p = strappend("/var/log/journal/", id);
1077         q = strjoin(directory, "/var/log/journal/", id, NULL);
1078         if (!p || !q)
1079                 return log_oom();
1080
1081         if (path_is_mount_point(p, false) > 0) {
1082                 if (arg_link_journal != LINK_AUTO) {
1083                         log_error("%s: already a mount point, refusing to use for journal", p);
1084                         return -EEXIST;
1085                 }
1086
1087                 return 0;
1088         }
1089
1090         if (path_is_mount_point(q, false) > 0) {
1091                 if (arg_link_journal != LINK_AUTO) {
1092                         log_error("%s: already a mount point, refusing to use for journal", q);
1093                         return -EEXIST;
1094                 }
1095
1096                 return 0;
1097         }
1098
1099         r = readlink_and_make_absolute(p, &d);
1100         if (r >= 0) {
1101                 if ((arg_link_journal == LINK_GUEST ||
1102                      arg_link_journal == LINK_AUTO) &&
1103                     path_equal(d, q)) {
1104
1105                         r = mkdir_p(q, 0755);
1106                         if (r < 0)
1107                                 log_warning("failed to create directory %s: %m", q);
1108                         return 0;
1109                 }
1110
1111                 if (unlink(p) < 0) {
1112                         log_error("Failed to remove symlink %s: %m", p);
1113                         return -errno;
1114                 }
1115         } else if (r == -EINVAL) {
1116
1117                 if (arg_link_journal == LINK_GUEST &&
1118                     rmdir(p) < 0) {
1119
1120                         if (errno == ENOTDIR) {
1121                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1122                                 return r;
1123                         } else {
1124                                 log_error("Failed to remove %s: %m", p);
1125                                 return -errno;
1126                         }
1127                 }
1128         } else if (r != -ENOENT) {
1129                 log_error("readlink(%s) failed: %m", p);
1130                 return r;
1131         }
1132
1133         if (arg_link_journal == LINK_GUEST) {
1134
1135                 if (symlink(q, p) < 0) {
1136                         log_error("Failed to symlink %s to %s: %m", q, p);
1137                         return -errno;
1138                 }
1139
1140                 r = mkdir_p(q, 0755);
1141                 if (r < 0)
1142                         log_warning("failed to create directory %s: %m", q);
1143                 return 0;
1144         }
1145
1146         if (arg_link_journal == LINK_HOST) {
1147                 r = mkdir_p(p, 0755);
1148                 if (r < 0) {
1149                         log_error("Failed to create %s: %m", p);
1150                         return r;
1151                 }
1152
1153         } else if (access(p, F_OK) < 0)
1154                 return 0;
1155
1156         if (dir_is_empty(q) == 0)
1157                 log_warning("%s is not empty, proceeding anyway.", q);
1158
1159         r = mkdir_p(q, 0755);
1160         if (r < 0) {
1161                 log_error("Failed to create %s: %m", q);
1162                 return r;
1163         }
1164
1165         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1166                 log_error("Failed to bind mount journal from host into guest: %m");
1167                 return -errno;
1168         }
1169
1170         return 0;
1171 }
1172
1173 static int setup_kdbus(const char *dest, const char *path) {
1174         const char *p;
1175
1176         if (!path)
1177                 return 0;
1178
1179         p = strappenda(dest, "/dev/kdbus");
1180         if (mkdir(p, 0755) < 0) {
1181                 log_error("Failed to create kdbus path: %m");
1182                 return  -errno;
1183         }
1184
1185         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1186                 log_error("Failed to mount kdbus domain path: %m");
1187                 return -errno;
1188         }
1189
1190         return 0;
1191 }
1192
1193 static int drop_capabilities(void) {
1194         return capability_bounding_set_drop(~arg_retain, false);
1195 }
1196
1197 static int register_machine(pid_t pid) {
1198         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1199         _cleanup_bus_unref_ sd_bus *bus = NULL;
1200         int r;
1201
1202         if (!arg_register)
1203                 return 0;
1204
1205         r = sd_bus_default_system(&bus);
1206         if (r < 0) {
1207                 log_error("Failed to open system bus: %s", strerror(-r));
1208                 return r;
1209         }
1210
1211         if (arg_keep_unit) {
1212                 r = sd_bus_call_method(
1213                                 bus,
1214                                 "org.freedesktop.machine1",
1215                                 "/org/freedesktop/machine1",
1216                                 "org.freedesktop.machine1.Manager",
1217                                 "RegisterMachine",
1218                                 &error,
1219                                 NULL,
1220                                 "sayssus",
1221                                 arg_machine,
1222                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1223                                 "nspawn",
1224                                 "container",
1225                                 (uint32_t) pid,
1226                                 strempty(arg_directory));
1227         } else {
1228                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1229
1230                 r = sd_bus_message_new_method_call(
1231                                 bus,
1232                                 &m,
1233                                 "org.freedesktop.machine1",
1234                                 "/org/freedesktop/machine1",
1235                                 "org.freedesktop.machine1.Manager",
1236                                 "CreateMachine");
1237                 if (r < 0) {
1238                         log_error("Failed to create message: %s", strerror(-r));
1239                         return r;
1240                 }
1241
1242                 r = sd_bus_message_append(
1243                                 m,
1244                                 "sayssus",
1245                                 arg_machine,
1246                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1247                                 "nspawn",
1248                                 "container",
1249                                 (uint32_t) pid,
1250                                 strempty(arg_directory));
1251                 if (r < 0) {
1252                         log_error("Failed to append message arguments: %s", strerror(-r));
1253                         return r;
1254                 }
1255
1256                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1257                 if (r < 0) {
1258                         log_error("Failed to open container: %s", strerror(-r));
1259                         return r;
1260                 }
1261
1262                 if (!isempty(arg_slice)) {
1263                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1264                         if (r < 0) {
1265                                 log_error("Failed to append slice: %s", strerror(-r));
1266                                 return r;
1267                         }
1268                 }
1269
1270                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1271                 if (r < 0) {
1272                         log_error("Failed to add device policy: %s", strerror(-r));
1273                         return r;
1274                 }
1275
1276                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1277                                           /* Allow the container to
1278                                            * access and create the API
1279                                            * device nodes, so that
1280                                            * PrivateDevices= in the
1281                                            * container can work
1282                                            * fine */
1283                                           "/dev/null", "rwm",
1284                                           "/dev/zero", "rwm",
1285                                           "/dev/full", "rwm",
1286                                           "/dev/random", "rwm",
1287                                           "/dev/urandom", "rwm",
1288                                           "/dev/tty", "rwm",
1289                                           /* Allow the container
1290                                            * access to ptys. However,
1291                                            * do not permit the
1292                                            * container to ever create
1293                                            * these device nodes. */
1294                                           "/dev/pts/ptmx", "rw",
1295                                           "char-pts", "rw",
1296                                           /* Allow the container
1297                                            * access to all kdbus
1298                                            * devices. Again, the
1299                                            * container cannot create
1300                                            * these nodes, only use
1301                                            * them. We use a pretty
1302                                            * open match here, so that
1303                                            * the kernel API can still
1304                                            * change. */
1305                                           "char-kdbus", "rw",
1306                                           "char-kdbus/*", "rw");
1307                 if (r < 0) {
1308                         log_error("Failed to add device whitelist: %s", strerror(-r));
1309                         return r;
1310                 }
1311
1312                 r = sd_bus_message_close_container(m);
1313                 if (r < 0) {
1314                         log_error("Failed to close container: %s", strerror(-r));
1315                         return r;
1316                 }
1317
1318                 r = sd_bus_call(bus, m, 0, &error, NULL);
1319         }
1320
1321         if (r < 0) {
1322                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1323                 return r;
1324         }
1325
1326         return 0;
1327 }
1328
1329 static int terminate_machine(pid_t pid) {
1330         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1331         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1332         _cleanup_bus_unref_ sd_bus *bus = NULL;
1333         const char *path;
1334         int r;
1335
1336         if (!arg_register)
1337                 return 0;
1338
1339         r = sd_bus_default_system(&bus);
1340         if (r < 0) {
1341                 log_error("Failed to open system bus: %s", strerror(-r));
1342                 return r;
1343         }
1344
1345         r = sd_bus_call_method(
1346                         bus,
1347                         "org.freedesktop.machine1",
1348                         "/org/freedesktop/machine1",
1349                         "org.freedesktop.machine1.Manager",
1350                         "GetMachineByPID",
1351                         &error,
1352                         &reply,
1353                         "u",
1354                         (uint32_t) pid);
1355         if (r < 0) {
1356                 /* Note that the machine might already have been
1357                  * cleaned up automatically, hence don't consider it a
1358                  * failure if we cannot get the machine object. */
1359                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1360                 return 0;
1361         }
1362
1363         r = sd_bus_message_read(reply, "o", &path);
1364         if (r < 0)
1365                 return bus_log_parse_error(r);
1366
1367         r = sd_bus_call_method(
1368                         bus,
1369                         "org.freedesktop.machine1",
1370                         path,
1371                         "org.freedesktop.machine1.Machine",
1372                         "Terminate",
1373                         &error,
1374                         NULL,
1375                         NULL);
1376         if (r < 0) {
1377                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1378                 return 0;
1379         }
1380
1381         return 0;
1382 }
1383
1384 static int reset_audit_loginuid(void) {
1385         _cleanup_free_ char *p = NULL;
1386         int r;
1387
1388         if (arg_share_system)
1389                 return 0;
1390
1391         r = read_one_line_file("/proc/self/loginuid", &p);
1392         if (r == -ENOENT)
1393                 return 0;
1394         if (r < 0) {
1395                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1396                 return r;
1397         }
1398
1399         /* Already reset? */
1400         if (streq(p, "4294967295"))
1401                 return 0;
1402
1403         r = write_string_file("/proc/self/loginuid", "4294967295");
1404         if (r < 0) {
1405                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1406                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1407                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1408                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1409                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1410
1411                 sleep(5);
1412         }
1413
1414         return 0;
1415 }
1416
1417 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1418
1419 static int get_mac(struct ether_addr *mac) {
1420         int r;
1421
1422         uint8_t result[8];
1423         size_t l, sz;
1424         uint8_t *v;
1425
1426         l = strlen(arg_machine);
1427         sz = sizeof(sd_id128_t) + l;
1428         v = alloca(sz);
1429
1430         /* fetch some persistent data unique to the host */
1431         r = sd_id128_get_machine((sd_id128_t*) v);
1432         if (r < 0)
1433                 return r;
1434
1435         /* combine with some data unique (on this host) to this
1436          * container instance */
1437         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1438
1439         /* Let's hash the host machine ID plus the container name. We
1440          * use a fixed, but originally randomly created hash key here. */
1441         siphash24(result, v, sz, HASH_KEY.bytes);
1442
1443         assert_cc(ETH_ALEN <= sizeof(result));
1444         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1445
1446         /* see eth_random_addr in the kernel */
1447         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1448         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1449
1450         return 0;
1451 }
1452
1453 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1454         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1455         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1456         struct ether_addr mac;
1457         int r;
1458
1459         if (!arg_private_network)
1460                 return 0;
1461
1462         if (!arg_network_veth)
1463                 return 0;
1464
1465         /* Use two different interface name prefixes depending whether
1466          * we are in bridge mode or not. */
1467         if (arg_network_bridge)
1468                 memcpy(iface_name, "vb-", 3);
1469         else
1470                 memcpy(iface_name, "ve-", 3);
1471         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1472
1473         r = get_mac(&mac);
1474         if (r < 0) {
1475                 log_error("Failed to generate predictable MAC address for host0");
1476                 return r;
1477         }
1478
1479         r = sd_rtnl_open(&rtnl, 0);
1480         if (r < 0) {
1481                 log_error("Failed to connect to netlink: %s", strerror(-r));
1482                 return r;
1483         }
1484
1485         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1486         if (r < 0) {
1487                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1488                 return r;
1489         }
1490
1491         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1492         if (r < 0) {
1493                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1494                 return r;
1495         }
1496
1497         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1498         if (r < 0) {
1499                 log_error("Failed to open netlink container: %s", strerror(-r));
1500                 return r;
1501         }
1502
1503         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1504         if (r < 0) {
1505                 log_error("Failed to open netlink container: %s", strerror(-r));
1506                 return r;
1507         }
1508
1509         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1510         if (r < 0) {
1511                 log_error("Failed to open netlink container: %s", strerror(-r));
1512                 return r;
1513         }
1514
1515         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1516         if (r < 0) {
1517                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1518                 return r;
1519         }
1520
1521         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1522         if (r < 0) {
1523                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1524                 return r;
1525         }
1526
1527         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1528         if (r < 0) {
1529                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1530                 return r;
1531         }
1532
1533         r = sd_rtnl_message_close_container(m);
1534         if (r < 0) {
1535                 log_error("Failed to close netlink container: %s", strerror(-r));
1536                 return r;
1537         }
1538
1539         r = sd_rtnl_message_close_container(m);
1540         if (r < 0) {
1541                 log_error("Failed to close netlink container: %s", strerror(-r));
1542                 return r;
1543         }
1544
1545         r = sd_rtnl_message_close_container(m);
1546         if (r < 0) {
1547                 log_error("Failed to close netlink container: %s", strerror(-r));
1548                 return r;
1549         }
1550
1551         r = sd_rtnl_call(rtnl, m, 0, NULL);
1552         if (r < 0) {
1553                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1554                 return r;
1555         }
1556
1557         return 0;
1558 }
1559
1560 static int setup_bridge(const char veth_name[]) {
1561         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1562         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1563         int r, bridge;
1564
1565         if (!arg_private_network)
1566                 return 0;
1567
1568         if (!arg_network_veth)
1569                 return 0;
1570
1571         if (!arg_network_bridge)
1572                 return 0;
1573
1574         bridge = (int) if_nametoindex(arg_network_bridge);
1575         if (bridge <= 0) {
1576                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1577                 return -errno;
1578         }
1579
1580         r = sd_rtnl_open(&rtnl, 0);
1581         if (r < 0) {
1582                 log_error("Failed to connect to netlink: %s", strerror(-r));
1583                 return r;
1584         }
1585
1586         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1587         if (r < 0) {
1588                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1589                 return r;
1590         }
1591
1592         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1593         if (r < 0) {
1594                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1595                 return r;
1596         }
1597
1598         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1599         if (r < 0) {
1600                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1601                 return r;
1602         }
1603
1604         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1605         if (r < 0) {
1606                 log_error("Failed to add netlink master field: %s", strerror(-r));
1607                 return r;
1608         }
1609
1610         r = sd_rtnl_call(rtnl, m, 0, NULL);
1611         if (r < 0) {
1612                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1613                 return r;
1614         }
1615
1616         return 0;
1617 }
1618
1619 static int parse_interface(struct udev *udev, const char *name) {
1620         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1621         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1622         int ifi;
1623
1624         ifi = (int) if_nametoindex(name);
1625         if (ifi <= 0) {
1626                 log_error("Failed to resolve interface %s: %m", name);
1627                 return -errno;
1628         }
1629
1630         sprintf(ifi_str, "n%i", ifi);
1631         d = udev_device_new_from_device_id(udev, ifi_str);
1632         if (!d) {
1633                 log_error("Failed to get udev device for interface %s: %m", name);
1634                 return -errno;
1635         }
1636
1637         if (udev_device_get_is_initialized(d) <= 0) {
1638                 log_error("Network interface %s is not initialized yet.", name);
1639                 return -EBUSY;
1640         }
1641
1642         return ifi;
1643 }
1644
1645 static int move_network_interfaces(pid_t pid) {
1646         _cleanup_udev_unref_ struct udev *udev = NULL;
1647         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1648         char **i;
1649         int r;
1650
1651         if (!arg_private_network)
1652                 return 0;
1653
1654         if (strv_isempty(arg_network_interfaces))
1655                 return 0;
1656
1657         r = sd_rtnl_open(&rtnl, 0);
1658         if (r < 0) {
1659                 log_error("Failed to connect to netlink: %s", strerror(-r));
1660                 return r;
1661         }
1662
1663         udev = udev_new();
1664         if (!udev) {
1665                 log_error("Failed to connect to udev.");
1666                 return -ENOMEM;
1667         }
1668
1669         STRV_FOREACH(i, arg_network_interfaces) {
1670                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1671                 int ifi;
1672
1673                 ifi = parse_interface(udev, *i);
1674                 if (ifi < 0)
1675                         return ifi;
1676
1677                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1678                 if (r < 0) {
1679                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1680                         return r;
1681                 }
1682
1683                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1684                 if (r < 0) {
1685                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1686                         return r;
1687                 }
1688
1689                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1690                 if (r < 0) {
1691                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1692                         return r;
1693                 }
1694         }
1695
1696         return 0;
1697 }
1698
1699 static int setup_macvlan(pid_t pid) {
1700         _cleanup_udev_unref_ struct udev *udev = NULL;
1701         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1702         char **i;
1703         int r;
1704
1705         if (!arg_private_network)
1706                 return 0;
1707
1708         if (strv_isempty(arg_network_macvlan))
1709                 return 0;
1710
1711         r = sd_rtnl_open(&rtnl, 0);
1712         if (r < 0) {
1713                 log_error("Failed to connect to netlink: %s", strerror(-r));
1714                 return r;
1715         }
1716
1717         udev = udev_new();
1718         if (!udev) {
1719                 log_error("Failed to connect to udev.");
1720                 return -ENOMEM;
1721         }
1722
1723         STRV_FOREACH(i, arg_network_macvlan) {
1724                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1725                 _cleanup_free_ char *n = NULL;
1726                 int ifi;
1727
1728                 ifi = parse_interface(udev, *i);
1729                 if (ifi < 0)
1730                         return ifi;
1731
1732                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1733                 if (r < 0) {
1734                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1735                         return r;
1736                 }
1737
1738                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1739                 if (r < 0) {
1740                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1741                         return r;
1742                 }
1743
1744                 n = strappend("mv-", *i);
1745                 if (!n)
1746                         return log_oom();
1747
1748                 strshorten(n, IFNAMSIZ-1);
1749
1750                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1751                 if (r < 0) {
1752                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1753                         return r;
1754                 }
1755
1756                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1757                 if (r < 0) {
1758                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1759                         return r;
1760                 }
1761
1762                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1763                 if (r < 0) {
1764                         log_error("Failed to open netlink container: %s", strerror(-r));
1765                         return r;
1766                 }
1767
1768                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1769                 if (r < 0) {
1770                         log_error("Failed to open netlink container: %s", strerror(-r));
1771                         return r;
1772                 }
1773
1774                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1775                 if (r < 0) {
1776                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1777                         return r;
1778                 }
1779
1780                 r = sd_rtnl_message_close_container(m);
1781                 if (r < 0) {
1782                         log_error("Failed to close netlink container: %s", strerror(-r));
1783                         return r;
1784                 }
1785
1786                 r = sd_rtnl_message_close_container(m);
1787                 if (r < 0) {
1788                         log_error("Failed to close netlink container: %s", strerror(-r));
1789                         return r;
1790                 }
1791
1792                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1793                 if (r < 0) {
1794                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1795                         return r;
1796                 }
1797         }
1798
1799         return 0;
1800 }
1801
/* Installs a seccomp filter that makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
 * EAFNOSUPPORT while allowing every other system call.
 *
 * Returns 0 on success (or when built without seccomp support), a negative errno-style value
 * on failure. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx filter;
        int r;

        /*
           The userspace audit hookup mostly fails when run inside a
           container, since audit is broken there. We don't care and
           simply prevent audit sockets from being created.

           With this filter socket(AF_NETLINK, *, NETLINK_AUDIT)
           returns EAFNOSUPPORT, which audit userspace takes as a sign
           that audit is disabled in the kernel.
         */

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        r = seccomp_add_secondary_archs(filter);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto release;
        }

        /* Match on argument 0 (domain) and argument 2 (protocol) of socket() */
        r = seccomp_rule_add(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto release;
        }

        r = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto release;
        }

        r = seccomp_load(filter);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

release:
        /* The filter context is released on success and failure alike */
        seccomp_release(filter);
        return r;
#else
        return 0;
#endif

}
1858
1859 static int setup_image(char **device_path, int *loop_nr) {
1860         struct loop_info64 info = {
1861                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1862         };
1863         _cleanup_close_ int fd = -1, control = -1, loop = -1;
1864         _cleanup_free_ char* loopdev = NULL;
1865         struct stat st;
1866         int r, nr;
1867
1868         assert(device_path);
1869         assert(loop_nr);
1870
1871         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1872         if (fd < 0) {
1873                 log_error("Failed to open %s: %m", arg_image);
1874                 return -errno;
1875         }
1876
1877         if (fstat(fd, &st) < 0) {
1878                 log_error("Failed to stat %s: %m", arg_image);
1879                 return -errno;
1880         }
1881
1882         if (S_ISBLK(st.st_mode)) {
1883                 char *p;
1884
1885                 p = strdup(arg_image);
1886                 if (!p)
1887                         return log_oom();
1888
1889                 *device_path = p;
1890
1891                 *loop_nr = -1;
1892
1893                 r = fd;
1894                 fd = -1;
1895
1896                 return r;
1897         }
1898
1899         if (!S_ISREG(st.st_mode)) {
1900                 log_error("%s is not a regular file or block device: %m", arg_image);
1901                 return -EINVAL;
1902         }
1903
1904         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1905         if (control < 0) {
1906                 log_error("Failed to open /dev/loop-control: %m");
1907                 return -errno;
1908         }
1909
1910         nr = ioctl(control, LOOP_CTL_GET_FREE);
1911         if (nr < 0) {
1912                 log_error("Failed to allocate loop device: %m");
1913                 return -errno;
1914         }
1915
1916         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1917                 return log_oom();
1918
1919         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1920         if (loop < 0) {
1921                 log_error("Failed to open loop device %s: %m", loopdev);
1922                 return -errno;
1923         }
1924
1925         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1926                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1927                 return -errno;
1928         }
1929
1930         if (arg_read_only)
1931                 info.lo_flags |= LO_FLAGS_READ_ONLY;
1932
1933         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1934                 log_error("Failed to set loopback settings on %s: %m", loopdev);
1935                 return -errno;
1936         }
1937
1938         *device_path = loopdev;
1939         loopdev = NULL;
1940
1941         *loop_nr = nr;
1942
1943         r = loop;
1944         loop = -1;
1945
1946         return r;
1947 }
1948
#ifdef HAVE_BLKID
/* Records 'node' as the device for one partition role, unless a device with a lower or equal
 * partition number was recorded before. On replacement the previous string in *device is
 * freed and *device_nr / *device_rw are updated.
 *
 * Returns 0 (whether or not the candidate was taken) or -ENOMEM. */
static int dissect_take_partition(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, bool rw) {

        char *copy;

        if (*device && nr >= *device_nr)
                return 0;

        copy = strdup(node);
        if (!copy)
                return -ENOMEM;

        free(*device);
        *device = copy;
        *device_nr = nr;
        *device_rw = rw;

        return 0;
}
#endif

/* Inspects the GPT partition table of the block device open as 'fd' and picks the partitions
 * to use as root, /home and /srv, identified by their partition type UUIDs.
 *
 * Partitions flagged GPT_FLAG_NO_AUTO are skipped. If several partitions carry the same type,
 * the one with the lowest partition number is used. A partition of the native root type is
 * preferred; only if there is none a secondary-architecture root is used, in which case
 * *secondary is set to true.
 *
 * On success returns 0; *root_device receives a newly allocated device node path and
 * *root_device_rw whether the partition lacks the read-only flag. *home_device/*home_device_rw
 * and *srv_device/*srv_device_rw are only written if such a partition was found. The caller
 * owns the returned strings.
 *
 * Returns a negative errno-style value on failure: -EINVAL if there is no GPT table or no root
 * partition, -ENOTSUP if compiled without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                /* Failure without errno set is treated as out-of-memory */
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        /* pttype stays NULL if the lookup fails, which streq_ptr() handles */
        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate all udev devices that have the whole-disk device as parent */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Initialized so that the cleanup handler never sees an indeterminate pointer */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                bool rw;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                /* Map the udev device back to its partition table entry */
                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                rw = !(flags & GPT_FLAG_READ_ONLY);
                r = 0;

                /* Partitions of any other type are ignored */
                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_take_partition(&home, &home_nr, &home_rw, node, nr, rw);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_take_partition(&srv, &srv_nr, &srv_rw, node, nr, rw);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_take_partition(&root, &root_nr, &root_rw, node, nr, rw);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_take_partition(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, rw);
#endif

                if (r < 0)
                        return log_oom();
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Hand the results over to the caller; clearing the locals keeps the cleanup
         * handlers from freeing what was passed out. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2198
/* Probes the file system type of the block device 'what' with libblkid and mounts it on
 * 'where', or on 'where' with 'directory' appended if 'directory' is non-NULL.
 *
 * The mount always carries MS_NODEV and is read-only if 'rw' is false or arg_read_only is
 * set. LUKS-encrypted devices are refused with -ENOTSUP.
 *
 * Returns 0 on success, a negative errno-style value on failure (-EINVAL if the file system
 * type cannot be determined, -ENOTSUP if compiled without blkid support). */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A read-only container overrides the per-partition setting */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 if the
                 * result is ambivalent; both mean the type is unknown. -1 is a real
                 * probing error and is handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2264
/* Mounts the dissected partitions of an image: the root device on 'where', the home device
 * on 'where' + "/home" and the server data device on 'where' + "/srv". Devices passed as NULL
 * are skipped; the *_rw flags are forwarded to mount_device().
 *
 * Returns 0 on success or the negative errno-style value of the first mount that failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* The root must go first, the other mount points live beneath it */
        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2300
2301 static void loop_remove(int nr, int *image_fd) {
2302         _cleanup_close_ int control = -1;
2303
2304         if (nr < 0)
2305                 return;
2306
2307         if (image_fd && *image_fd >= 0) {
2308                 ioctl(*image_fd, LOOP_CLR_FD);
2309                 *image_fd = safe_close(*image_fd);
2310         }
2311
2312         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2313         if (control < 0)
2314                 return;
2315
2316         ioctl(control, LOOP_CTL_REMOVE, nr);
2317 }
2318
2319 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2320         int pipe_fds[2];
2321         pid_t pid;
2322
2323         assert(database);
2324         assert(key);
2325         assert(rpid);
2326
2327         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2328                 log_error("Failed to allocate pipe: %m");
2329                 return -errno;
2330         }
2331
2332         pid = fork();
2333         if (pid < 0) {
2334                 log_error("Failed to fork getent child: %m");
2335                 return -errno;
2336         } else if (pid == 0) {
2337                 int nullfd;
2338                 char *empty_env = NULL;
2339
2340                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2341                         _exit(EXIT_FAILURE);
2342
2343                 if (pipe_fds[0] > 2)
2344                         safe_close(pipe_fds[0]);
2345                 if (pipe_fds[1] > 2)
2346                         safe_close(pipe_fds[1]);
2347
2348                 nullfd = open("/dev/null", O_RDWR);
2349                 if (nullfd < 0)
2350                         _exit(EXIT_FAILURE);
2351
2352                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2353                         _exit(EXIT_FAILURE);
2354
2355                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2356                         _exit(EXIT_FAILURE);
2357
2358                 if (nullfd > 2)
2359                         safe_close(nullfd);
2360
2361                 reset_all_signal_handlers();
2362                 close_all_fds(NULL, 0);
2363
2364                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2365                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2366                 _exit(EXIT_FAILURE);
2367         }
2368
2369         pipe_fds[1] = safe_close(pipe_fds[1]);
2370
2371         *rpid = pid;
2372
2373         return pipe_fds[0];
2374 }
2375
2376 static int change_uid_gid(char **_home) {
2377         char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2378         _cleanup_free_ uid_t *uids = NULL;
2379         _cleanup_free_ char *home = NULL;
2380         _cleanup_fclose_ FILE *f = NULL;
2381         _cleanup_close_ int fd = -1;
2382         unsigned n_uids = 0;
2383         size_t sz = 0, l;
2384         uid_t uid;
2385         gid_t gid;
2386         pid_t pid;
2387         int r;
2388
2389         assert(_home);
2390
2391         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2392                 /* Reset everything fully to 0, just in case */
2393
2394                 if (setgroups(0, NULL) < 0) {
2395                         log_error("setgroups() failed: %m");
2396                         return -errno;
2397                 }
2398
2399                 if (setresgid(0, 0, 0) < 0) {
2400                         log_error("setregid() failed: %m");
2401                         return -errno;
2402                 }
2403
2404                 if (setresuid(0, 0, 0) < 0) {
2405                         log_error("setreuid() failed: %m");
2406                         return -errno;
2407                 }
2408
2409                 *_home = NULL;
2410                 return 0;
2411         }
2412
2413         /* First, get user credentials */
2414         fd = spawn_getent("passwd", arg_user, &pid);
2415         if (fd < 0)
2416                 return fd;
2417
2418         f = fdopen(fd, "r");
2419         if (!f)
2420                 return log_oom();
2421         fd = -1;
2422
2423         if (!fgets(line, sizeof(line), f)) {
2424
2425                 if (!ferror(f)) {
2426                         log_error("Failed to resolve user %s.", arg_user);
2427                         return -ESRCH;
2428                 }
2429
2430                 log_error("Failed to read from getent: %m");
2431                 return -errno;
2432         }
2433
2434         truncate_nl(line);
2435
2436         wait_for_terminate_and_warn("getent passwd", pid);
2437
2438         x = strchr(line, ':');
2439         if (!x) {
2440                 log_error("/etc/passwd entry has invalid user field.");
2441                 return -EIO;
2442         }
2443
2444         u = strchr(x+1, ':');
2445         if (!u) {
2446                 log_error("/etc/passwd entry has invalid password field.");
2447                 return -EIO;
2448         }
2449
2450         u++;
2451         g = strchr(u, ':');
2452         if (!g) {
2453                 log_error("/etc/passwd entry has invalid UID field.");
2454                 return -EIO;
2455         }
2456
2457         *g = 0;
2458         g++;
2459         x = strchr(g, ':');
2460         if (!x) {
2461                 log_error("/etc/passwd entry has invalid GID field.");
2462                 return -EIO;
2463         }
2464
2465         *x = 0;
2466         h = strchr(x+1, ':');
2467         if (!h) {
2468                 log_error("/etc/passwd entry has invalid GECOS field.");
2469                 return -EIO;
2470         }
2471
2472         h++;
2473         x = strchr(h, ':');
2474         if (!x) {
2475                 log_error("/etc/passwd entry has invalid home directory field.");
2476                 return -EIO;
2477         }
2478
2479         *x = 0;
2480
2481         r = parse_uid(u, &uid);
2482         if (r < 0) {
2483                 log_error("Failed to parse UID of user.");
2484                 return -EIO;
2485         }
2486
2487         r = parse_gid(g, &gid);
2488         if (r < 0) {
2489                 log_error("Failed to parse GID of user.");
2490                 return -EIO;
2491         }
2492
2493         home = strdup(h);
2494         if (!home)
2495                 return log_oom();
2496
2497         /* Second, get group memberships */
2498         fd = spawn_getent("initgroups", arg_user, &pid);
2499         if (fd < 0)
2500                 return fd;
2501
2502         fclose(f);
2503         f = fdopen(fd, "r");
2504         if (!f)
2505                 return log_oom();
2506         fd = -1;
2507
2508         if (!fgets(line, sizeof(line), f)) {
2509                 if (!ferror(f)) {
2510                         log_error("Failed to resolve user %s.", arg_user);
2511                         return -ESRCH;
2512                 }
2513
2514                 log_error("Failed to read from getent: %m");
2515                 return -errno;
2516         }
2517
2518         truncate_nl(line);
2519
2520         wait_for_terminate_and_warn("getent initgroups", pid);
2521
2522         /* Skip over the username and subsequent separator whitespace */
2523         x = line;
2524         x += strcspn(x, WHITESPACE);
2525         x += strspn(x, WHITESPACE);
2526
2527         FOREACH_WORD(w, l, x, state) {
2528                 char c[l+1];
2529
2530                 memcpy(c, w, l);
2531                 c[l] = 0;
2532
2533                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2534                         return log_oom();
2535
2536                 r = parse_uid(c, &uids[n_uids++]);
2537                 if (r < 0) {
2538                         log_error("Failed to parse group data from getent.");
2539                         return -EIO;
2540                 }
2541         }
2542
2543         r = mkdir_parents(home, 0775);
2544         if (r < 0) {
2545                 log_error("Failed to make home root directory: %s", strerror(-r));
2546                 return r;
2547         }
2548
2549         r = mkdir_safe(home, 0755, uid, gid);
2550         if (r < 0 && r != -EEXIST) {
2551                 log_error("Failed to make home directory: %s", strerror(-r));
2552                 return r;
2553         }
2554
2555         fchown(STDIN_FILENO, uid, gid);
2556         fchown(STDOUT_FILENO, uid, gid);
2557         fchown(STDERR_FILENO, uid, gid);
2558
2559         if (setgroups(n_uids, uids) < 0) {
2560                 log_error("Failed to set auxiliary groups: %m");
2561                 return -errno;
2562         }
2563
2564         if (setresgid(gid, gid, gid) < 0) {
2565                 log_error("setregid() failed: %m");
2566                 return -errno;
2567         }
2568
2569         if (setresuid(uid, uid, uid) < 0) {
2570                 log_error("setreuid() failed: %m");
2571                 return -errno;
2572         }
2573
2574         if (_home) {
2575                 *_home = home;
2576                 home = NULL;
2577         }
2578
2579         return 0;
2580 }
2581
2582 /*
2583  * Return 0 in case the container is being rebooted, has been shut
2584  * down or exited successfully. On failures a negative value is
2585  * returned.
2586  *
2587  * The status of the container "CONTAINER_TERMINATED" or
2588  * "CONTAINER_REBOOTED" will be saved in the container argument
2589  */
2590 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2591         int r;
2592         siginfo_t status;
2593
2594         r = wait_for_terminate(pid, &status);
2595         if (r < 0)
2596                 return r;
2597
2598         switch (status.si_code) {
2599         case CLD_EXITED:
2600                 r = status.si_status;
2601                 if (r == 0) {
2602                         if (!arg_quiet)
2603                                 log_debug("Container %s exited successfully.",
2604                                           arg_machine);
2605
2606                         *container = CONTAINER_TERMINATED;
2607                 } else {
2608                         log_error("Container %s failed with error code %i.",
2609                                   arg_machine, status.si_status);
2610                         r = -1;
2611                 }
2612                 break;
2613
2614         case CLD_KILLED:
2615                 if (status.si_status == SIGINT) {
2616                         if (!arg_quiet)
2617                                 log_info("Container %s has been shut down.",
2618                                          arg_machine);
2619
2620                         *container = CONTAINER_TERMINATED;
2621                         r = 0;
2622                         break;
2623                 } else if (status.si_status == SIGHUP) {
2624                         if (!arg_quiet)
2625                                 log_info("Container %s is being rebooted.",
2626                                          arg_machine);
2627
2628                         *container = CONTAINER_REBOOTED;
2629                         r = 0;
2630                         break;
2631                 }
2632                 /* CLD_KILLED fallthrough */
2633
2634         case CLD_DUMPED:
2635                 log_error("Container %s terminated by signal %s.",
2636                           arg_machine, signal_to_string(status.si_status));
2637                 r = -1;
2638                 break;
2639
2640         default:
2641                 log_error("Container %s failed due to unknown reason.",
2642                           arg_machine);
2643                 r = -1;
2644                 break;
2645         }
2646
2647         return r;
2648 }
2649
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
2651
2652 int main(int argc, char *argv[]) {
2653
2654         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2655         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2656         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2657         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2658         _cleanup_fdset_free_ FDSet *fds = NULL;
2659         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2660         const char *console = NULL;
2661         char veth_name[IFNAMSIZ];
2662         bool secondary = false;
2663         sigset_t mask, mask_chld;
2664         pid_t pid = 0;
2665
2666         log_parse_environment();
2667         log_open();
2668
2669         k = parse_argv(argc, argv);
2670         if (k < 0)
2671                 goto finish;
2672         else if (k == 0) {
2673                 r = EXIT_SUCCESS;
2674                 goto finish;
2675         }
2676
2677         if (!arg_image) {
2678                 if (arg_directory) {
2679                         char *p;
2680
2681                         p = path_make_absolute_cwd(arg_directory);
2682                         free(arg_directory);
2683                         arg_directory = p;
2684                 } else
2685                         arg_directory = get_current_dir_name();
2686
2687                 if (!arg_directory) {
2688                         log_error("Failed to determine path, please use -D.");
2689                         goto finish;
2690                 }
2691                 path_kill_slashes(arg_directory);
2692         }
2693
2694         if (!arg_machine) {
2695                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2696                 if (!arg_machine) {
2697                         log_oom();
2698                         goto finish;
2699                 }
2700
2701                 hostname_cleanup(arg_machine, false);
2702                 if (isempty(arg_machine)) {
2703                         log_error("Failed to determine machine name automatically, please use -M.");
2704                         goto finish;
2705                 }
2706         }
2707
2708         if (geteuid() != 0) {
2709                 log_error("Need to be root.");
2710                 goto finish;
2711         }
2712
2713         if (sd_booted() <= 0) {
2714                 log_error("Not running on a systemd system.");
2715                 goto finish;
2716         }
2717
2718         log_close();
2719         n_fd_passed = sd_listen_fds(false);
2720         if (n_fd_passed > 0) {
2721                 k = fdset_new_listen_fds(&fds, false);
2722                 if (k < 0) {
2723                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2724                         goto finish;
2725                 }
2726         }
2727         fdset_close_others(fds);
2728         log_open();
2729
2730         if (arg_directory) {
2731                 if (path_equal(arg_directory, "/")) {
2732                         log_error("Spawning container on root directory not supported.");
2733                         goto finish;
2734                 }
2735
2736                 if (arg_boot) {
2737                         if (path_is_os_tree(arg_directory) <= 0) {
2738                                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2739                                 goto finish;
2740                         }
2741                 } else {
2742                         const char *p;
2743
2744                         p = strappenda(arg_directory,
2745                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2746                         if (access(p, F_OK) < 0) {
2747                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2748                                 goto finish;
2749
2750                         }
2751                 }
2752         } else {
2753                 char template[] = "/tmp/nspawn-root-XXXXXX";
2754
2755                 if (!mkdtemp(template)) {
2756                         log_error("Failed to create temporary directory: %m");
2757                         r = -errno;
2758                         goto finish;
2759                 }
2760
2761                 arg_directory = strdup(template);
2762                 if (!arg_directory) {
2763                         r = log_oom();
2764                         goto finish;
2765                 }
2766
2767                 image_fd = setup_image(&device_path, &loop_nr);
2768                 if (image_fd < 0) {
2769                         r = image_fd;
2770                         goto finish;
2771                 }
2772
2773                 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2774                 if (r < 0)
2775                         goto finish;
2776         }
2777
2778         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2779         if (master < 0) {
2780                 log_error("Failed to acquire pseudo tty: %m");
2781                 goto finish;
2782         }
2783
2784         console = ptsname(master);
2785         if (!console) {
2786                 log_error("Failed to determine tty name: %m");
2787                 goto finish;
2788         }
2789
2790         if (!arg_quiet)
2791                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2792                          arg_machine, arg_image ? arg_image : arg_directory);
2793
2794         if (unlockpt(master) < 0) {
2795                 log_error("Failed to unlock tty: %m");
2796                 goto finish;
2797         }
2798
2799         if (access("/dev/kdbus/control", F_OK) >= 0) {
2800
2801                 if (arg_share_system) {
2802                         kdbus_domain = strdup("/dev/kdbus");
2803                         if (!kdbus_domain) {
2804                                 log_oom();
2805                                 goto finish;
2806                         }
2807                 } else {
2808                         const char *ns;
2809
2810                         ns = strappenda("machine-", arg_machine);
2811                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2812                         if (r < 0)
2813                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2814                         else
2815                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2816                 }
2817         }
2818
2819         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2820                 log_error("Failed to create kmsg socket pair: %m");
2821                 goto finish;
2822         }
2823
2824         sd_notify(0, "READY=1");
2825
2826         assert_se(sigemptyset(&mask) == 0);
2827         assert_se(sigemptyset(&mask_chld) == 0);
2828         sigaddset(&mask_chld, SIGCHLD);
2829         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2830         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2831
2832         for (;;) {
2833                 ContainerStatus container_status;
2834                 int eventfds[2] = { -1, -1 };
2835                 struct sigaction sa = {
2836                         .sa_handler = nop_handler,
2837                         .sa_flags = SA_NOCLDSTOP,
2838                 };
2839
2840                 /* Child can be killed before execv(), so handle SIGCHLD
2841                  * in order to interrupt parent's blocking calls and
2842                  * give it a chance to call wait() and terminate. */
2843                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2844                 if (r < 0) {
2845                         log_error("Failed to change the signal mask: %m");
2846                         goto finish;
2847                 }
2848
2849                 r = sigaction(SIGCHLD, &sa, NULL);
2850                 if (r < 0) {
2851                         log_error("Failed to install SIGCHLD handler: %m");
2852                         goto finish;
2853                 }
2854
2855                 pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
2856                                          (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2857                                          (arg_private_network ? CLONE_NEWNET : 0), eventfds);
2858                 if (pid < 0) {
2859                         if (errno == EINVAL)
2860                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2861                         else
2862                                 log_error("clone() failed: %m");
2863
2864                         r = pid;
2865                         goto finish;
2866                 }
2867
2868                 if (pid == 0) {
2869                         /* child */
2870                         _cleanup_free_ char *home = NULL;
2871                         unsigned n_env = 2;
2872                         const char *envp[] = {
2873                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
2874                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2875                                 NULL, /* TERM */
2876                                 NULL, /* HOME */
2877                                 NULL, /* USER */
2878                                 NULL, /* LOGNAME */
2879                                 NULL, /* container_uuid */
2880                                 NULL, /* LISTEN_FDS */
2881                                 NULL, /* LISTEN_PID */
2882                                 NULL
2883                         };
2884                         char **env_use;
2885
2886                         envp[n_env] = strv_find_prefix(environ, "TERM=");
2887                         if (envp[n_env])
2888                                 n_env ++;
2889
2890                         master = safe_close(master);
2891
2892                         close_nointr(STDIN_FILENO);
2893                         close_nointr(STDOUT_FILENO);
2894                         close_nointr(STDERR_FILENO);
2895
2896                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2897
2898                         reset_all_signal_handlers();
2899
2900                         assert_se(sigemptyset(&mask) == 0);
2901                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2902
2903                         k = open_terminal(console, O_RDWR);
2904                         if (k != STDIN_FILENO) {
2905                                 if (k >= 0) {
2906                                         safe_close(k);
2907                                         k = -EINVAL;
2908                                 }
2909
2910                                 log_error("Failed to open console: %s", strerror(-k));
2911                                 goto child_fail;
2912                         }
2913
2914                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2915                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2916                                 log_error("Failed to duplicate console: %m");
2917                                 goto child_fail;
2918                         }
2919
2920                         if (setsid() < 0) {
2921                                 log_error("setsid() failed: %m");
2922                                 goto child_fail;
2923                         }
2924
2925                         if (reset_audit_loginuid() < 0)
2926                                 goto child_fail;
2927
2928                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2929                                 log_error("PR_SET_PDEATHSIG failed: %m");
2930                                 goto child_fail;
2931                         }
2932
2933                         /* Mark everything as slave, so that we still
2934                          * receive mounts from the real root, but don't
2935                          * propagate mounts to the real root. */
2936                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2937                                 log_error("MS_SLAVE|MS_REC failed: %m");
2938                                 goto child_fail;
2939                         }
2940
2941                         if (mount_devices(arg_directory,
2942                                           root_device, root_device_rw,
2943                                           home_device, home_device_rw,
2944                                           srv_device, srv_device_rw) < 0)
2945                                 goto child_fail;
2946
2947                         /* Turn directory into bind mount */
2948                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2949                                 log_error("Failed to make bind mount: %m");
2950                                 goto child_fail;
2951                         }
2952
2953                         if (arg_read_only) {
2954                                 k = bind_remount_recursive(arg_directory, true);
2955                                 if (k < 0) {
2956                                         log_error("Failed to make tree read-only: %s", strerror(-k));
2957                                         goto child_fail;
2958                                 }
2959                         }
2960
2961                         if (mount_all(arg_directory) < 0)
2962                                 goto child_fail;
2963
2964                         if (copy_devnodes(arg_directory) < 0)
2965                                 goto child_fail;
2966
2967                         if (setup_ptmx(arg_directory) < 0)
2968                                 goto child_fail;
2969
2970                         dev_setup(arg_directory);
2971
2972                         if (audit_still_doesnt_work_in_containers() < 0)
2973                                 goto child_fail;
2974
2975                         if (setup_dev_console(arg_directory, console) < 0)
2976                                 goto child_fail;
2977
2978                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2979                                 goto child_fail;
2980
2981                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
2982
2983                         if (setup_boot_id(arg_directory) < 0)
2984                                 goto child_fail;
2985
2986                         if (setup_timezone(arg_directory) < 0)
2987                                 goto child_fail;
2988
2989                         if (setup_resolv_conf(arg_directory) < 0)
2990                                 goto child_fail;
2991
2992                         if (setup_journal(arg_directory) < 0)
2993                                 goto child_fail;
2994
2995                         if (mount_binds(arg_directory, arg_bind, false) < 0)
2996                                 goto child_fail;
2997
2998                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
2999                                 goto child_fail;
3000
3001                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3002                                 goto child_fail;
3003
3004                         /* Tell the parent that we are ready, and that
3005                          * it can cgroupify us to that we lack access
3006                          * to certain devices and resources. */
3007                         r = eventfd_send_state(eventfds[1],
3008                                                EVENTFD_CHILD_SUCCEEDED);
3009                         eventfds[1] = safe_close(eventfds[1]);
3010                         if (r < 0)
3011                                 goto child_fail;
3012
3013                         if (chdir(arg_directory) < 0) {
3014                                 log_error("chdir(%s) failed: %m", arg_directory);
3015                                 goto child_fail;
3016                         }
3017
3018                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3019                                 log_error("mount(MS_MOVE) failed: %m");
3020                                 goto child_fail;
3021                         }
3022
3023                         if (chroot(".") < 0) {
3024                                 log_error("chroot() failed: %m");
3025                                 goto child_fail;
3026                         }
3027
3028                         if (chdir("/") < 0) {
3029                                 log_error("chdir() failed: %m");
3030                                 goto child_fail;
3031                         }
3032
3033                         umask(0022);
3034
3035                         if (arg_private_network)
3036                                 loopback_setup();
3037
3038                         if (drop_capabilities() < 0) {
3039                                 log_error("drop_capabilities() failed: %m");
3040                                 goto child_fail;
3041                         }
3042
3043                         r = change_uid_gid(&home);
3044                         if (r < 0)
3045                                 goto child_fail;
3046
3047                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3048                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3049                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3050                                 log_oom();
3051                                 goto child_fail;
3052                         }
3053
3054                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3055                                 char as_uuid[37];
3056
3057                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3058                                         log_oom();
3059                                         goto child_fail;
3060                                 }
3061                         }
3062
3063                         if (fdset_size(fds) > 0) {
3064                                 k = fdset_cloexec(fds, false);
3065                                 if (k < 0) {
3066                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3067                                         goto child_fail;
3068                                 }
3069
3070                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3071                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3072                                         log_oom();
3073                                         goto child_fail;
3074                                 }
3075                         }
3076
3077                         setup_hostname();
3078
3079                         if (arg_personality != 0xffffffffLU) {
3080                                 if (personality(arg_personality) < 0) {
3081                                         log_error("personality() failed: %m");
3082                                         goto child_fail;
3083                                 }
3084                         } else if (secondary) {
3085                                 if (personality(PER_LINUX32) < 0) {
3086                                         log_error("personality() failed: %m");
3087                                         goto child_fail;
3088                                 }
3089                         }
3090
3091 #ifdef HAVE_SELINUX
3092                         if (arg_selinux_context)
3093                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3094                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3095                                         goto child_fail;
3096                                 }
3097 #endif
3098
3099                         if (!strv_isempty(arg_setenv)) {
3100                                 char **n;
3101
3102                                 n = strv_env_merge(2, envp, arg_setenv);
3103                                 if (!n) {
3104                                         log_oom();
3105                                         goto child_fail;
3106                                 }
3107
3108                                 env_use = n;
3109                         } else
3110                                 env_use = (char**) envp;
3111
3112                         /* Wait until the parent is ready with the setup, too... */
3113                         r = eventfd_parent_succeeded(eventfds[0]);
3114                         eventfds[0] = safe_close(eventfds[0]);
3115                         if (r < 0)
3116                                 goto child_fail;
3117
3118                         if (arg_boot) {
3119                                 char **a;
3120                                 size_t l;
3121
3122                                 /* Automatically search for the init system */
3123
3124                                 l = 1 + argc - optind;
3125                                 a = newa(char*, l + 1);
3126                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3127
3128                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3129                                 execve(a[0], a, env_use);
3130
3131                                 a[0] = (char*) "/lib/systemd/systemd";
3132                                 execve(a[0], a, env_use);
3133
3134                                 a[0] = (char*) "/sbin/init";
3135                                 execve(a[0], a, env_use);
3136                         } else if (argc > optind)
3137                                 execvpe(argv[optind], argv + optind, env_use);
3138                         else {
3139                                 chdir(home ? home : "/root");
3140                                 execle("/bin/bash", "-bash", NULL, env_use);
3141                                 execle("/bin/sh", "-sh", NULL, env_use);
3142                         }
3143
3144                         log_error("execv() failed: %m");
3145
3146                 child_fail:
3147                         /* Tell the parent that the setup failed, so he
3148                          * can clean up resources and terminate. */
3149                         if (eventfds[1] != -1)
3150                                 eventfd_send_state(eventfds[1],
3151                                                    EVENTFD_CHILD_FAILED);
3152                         _exit(EXIT_FAILURE);
3153                 }
3154
3155                 fdset_free(fds);
3156                 fds = NULL;
3157
3158                 /* Wait for the child event:
3159                  * If EVENTFD_CHILD_FAILED, the child will terminate soon.
3160                  * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
3161                  * it is ready with all it needs to do with priviliges.
3162                  * After we got the notification we can make the process
3163                  * join its cgroup which might limit what it can do */
3164                 r = eventfd_child_succeeded(eventfds[1]);
3165                 eventfds[1] = safe_close(eventfds[1]);
3166                 if (r < 0)
3167                         goto check_container_status;
3168
3169                 r = register_machine(pid);
3170                 if (r < 0)
3171                         goto finish;
3172
3173                 r = move_network_interfaces(pid);
3174                 if (r < 0)
3175                         goto finish;
3176
3177                 r = setup_veth(pid, veth_name);
3178                 if (r < 0)
3179                         goto finish;
3180
3181                 r = setup_bridge(veth_name);
3182                 if (r < 0)
3183                         goto finish;
3184
3185                 r = setup_macvlan(pid);
3186                 if (r < 0)
3187                         goto finish;
3188
3189                 /* Block SIGCHLD here, before notifying child.
3190                  * process_pty() will handle it with the other signals. */
3191                 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3192                 if (r < 0)
3193                         goto finish;
3194
3195                 /* Reset signal to default */
3196                 r = default_signals(SIGCHLD, -1);
3197                 if (r < 0)
3198                         goto finish;
3199
3200                 /* Notify the child that the parent is ready with all
3201                  * its setup, and that the child can now hand over
3202                  * control to the code to run inside the container. */
3203                 r = eventfd_send_state(eventfds[0],
3204                                        EVENTFD_PARENT_SUCCEEDED);
3205                 eventfds[0] = safe_close(eventfds[0]);
3206                 if (r < 0)
3207                         goto finish;
3208
3209                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3210                 if (k < 0) {
3211                         r = EXIT_FAILURE;
3212                         break;
3213                 }
3214
3215                 if (!arg_quiet)
3216                         putc('\n', stdout);
3217
3218                 /* Kill if it is not dead yet anyway */
3219                 terminate_machine(pid);
3220
3221 check_container_status:
3222                 /* Redundant, but better safe than sorry */
3223                 kill(pid, SIGKILL);
3224
3225                 r = wait_for_container(pid, &container_status);
3226                 pid = 0;
3227
3228                 if (r < 0) {
3229                         r = EXIT_FAILURE;
3230                         break;
3231                 } else if (container_status == CONTAINER_TERMINATED)
3232                         break;
3233
3234                 /* CONTAINER_REBOOTED, loop again */
3235         }
3236
3237 finish:
3238         loop_remove(loop_nr, &image_fd);
3239
3240         if (pid > 0)
3241                 kill(pid, SIGKILL);
3242
3243         free(arg_directory);
3244         free(arg_machine);
3245         free(arg_user);
3246         strv_free(arg_setenv);
3247         strv_free(arg_network_interfaces);
3248         strv_free(arg_network_macvlan);
3249         strv_free(arg_bind);
3250         strv_free(arg_bind_ro);
3251
3252         return r;
3253 }