chiark / gitweb /
nspawn: make nspawn robust to container failure
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "eventfd-util.h"
88 #include "blkid-util.h"
89 #include "gpt.h"
90 #include "siphash24.h"
91
92 #ifdef HAVE_SECCOMP
93 #include "seccomp-util.h"
94 #endif
95
/* How a container run came to an end. The code that produces and consumes
 * these values lies outside this part of the file. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
100
/* Journal linking mode, selected with --link-journal=MODE (or -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* --link-journal=no */
        LINK_AUTO  = 1,   /* --link-journal=auto; the built-in default */
        LINK_HOST  = 2,   /* --link-journal=host */
        LINK_GUEST = 3    /* --link-journal=guest, also what -j selects */
} LinkJournal;
107
108 static char *arg_directory = NULL;
109 static char *arg_user = NULL;
110 static sd_id128_t arg_uuid = {};
111 static char *arg_machine = NULL;
112 static const char *arg_selinux_context = NULL;
113 static const char *arg_selinux_apifs_context = NULL;
114 static const char *arg_slice = NULL;
115 static bool arg_private_network = false;
116 static bool arg_read_only = false;
117 static bool arg_boot = false;
118 static LinkJournal arg_link_journal = LINK_AUTO;
119 static uint64_t arg_retain =
120         (1ULL << CAP_CHOWN) |
121         (1ULL << CAP_DAC_OVERRIDE) |
122         (1ULL << CAP_DAC_READ_SEARCH) |
123         (1ULL << CAP_FOWNER) |
124         (1ULL << CAP_FSETID) |
125         (1ULL << CAP_IPC_OWNER) |
126         (1ULL << CAP_KILL) |
127         (1ULL << CAP_LEASE) |
128         (1ULL << CAP_LINUX_IMMUTABLE) |
129         (1ULL << CAP_NET_BIND_SERVICE) |
130         (1ULL << CAP_NET_BROADCAST) |
131         (1ULL << CAP_NET_RAW) |
132         (1ULL << CAP_SETGID) |
133         (1ULL << CAP_SETFCAP) |
134         (1ULL << CAP_SETPCAP) |
135         (1ULL << CAP_SETUID) |
136         (1ULL << CAP_SYS_ADMIN) |
137         (1ULL << CAP_SYS_CHROOT) |
138         (1ULL << CAP_SYS_NICE) |
139         (1ULL << CAP_SYS_PTRACE) |
140         (1ULL << CAP_SYS_TTY_CONFIG) |
141         (1ULL << CAP_SYS_RESOURCE) |
142         (1ULL << CAP_SYS_BOOT) |
143         (1ULL << CAP_AUDIT_WRITE) |
144         (1ULL << CAP_AUDIT_CONTROL) |
145         (1ULL << CAP_MKNOD);
146 static char **arg_bind = NULL;
147 static char **arg_bind_ro = NULL;
148 static char **arg_setenv = NULL;
149 static bool arg_quiet = false;
150 static bool arg_share_system = false;
151 static bool arg_register = true;
152 static bool arg_keep_unit = false;
153 static char **arg_network_interfaces = NULL;
154 static char **arg_network_macvlan = NULL;
155 static bool arg_network_veth = false;
156 static const char *arg_network_bridge = NULL;
157 static unsigned long arg_personality = 0xffffffffLU;
158 static const char *arg_image = NULL;
159
/*
 * Print the usage text to stdout.
 *
 * Always returns 0, so that parse_argv() can hand the value straight back to
 * its caller for -h/--help.
 *
 * The text is kept in sync with parse_argv(): -j selects the "guest" journal
 * link mode there, and --personality= is an accepted option.
 */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "  -i --image=PATH           File system device or image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --personality=ARCH     Pick personality for this container\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n",
               program_invocation_short_name);

        return 0;
}
211
212 static int parse_argv(int argc, char *argv[]) {
213
214         enum {
215                 ARG_VERSION = 0x100,
216                 ARG_PRIVATE_NETWORK,
217                 ARG_UUID,
218                 ARG_READ_ONLY,
219                 ARG_CAPABILITY,
220                 ARG_DROP_CAPABILITY,
221                 ARG_LINK_JOURNAL,
222                 ARG_BIND,
223                 ARG_BIND_RO,
224                 ARG_SETENV,
225                 ARG_SHARE_SYSTEM,
226                 ARG_REGISTER,
227                 ARG_KEEP_UNIT,
228                 ARG_NETWORK_INTERFACE,
229                 ARG_NETWORK_MACVLAN,
230                 ARG_NETWORK_VETH,
231                 ARG_NETWORK_BRIDGE,
232                 ARG_PERSONALITY,
233         };
234
235         static const struct option options[] = {
236                 { "help",                  no_argument,       NULL, 'h'                   },
237                 { "version",               no_argument,       NULL, ARG_VERSION           },
238                 { "directory",             required_argument, NULL, 'D'                   },
239                 { "user",                  required_argument, NULL, 'u'                   },
240                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
241                 { "boot",                  no_argument,       NULL, 'b'                   },
242                 { "uuid",                  required_argument, NULL, ARG_UUID              },
243                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
244                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
245                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
246                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
247                 { "bind",                  required_argument, NULL, ARG_BIND              },
248                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
249                 { "machine",               required_argument, NULL, 'M'                   },
250                 { "slice",                 required_argument, NULL, 'S'                   },
251                 { "setenv",                required_argument, NULL, ARG_SETENV            },
252                 { "selinux-context",       required_argument, NULL, 'Z'                   },
253                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
254                 { "quiet",                 no_argument,       NULL, 'q'                   },
255                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
256                 { "register",              required_argument, NULL, ARG_REGISTER          },
257                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
258                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
259                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
260                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
261                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
262                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
263                 { "image",                 required_argument, NULL, 'i'                   },
264                 {}
265         };
266
267         int c, r;
268         uint64_t plus = 0, minus = 0;
269
270         assert(argc >= 0);
271         assert(argv);
272
273         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
274
275                 switch (c) {
276
277                 case 'h':
278                         return help();
279
280                 case ARG_VERSION:
281                         puts(PACKAGE_STRING);
282                         puts(SYSTEMD_FEATURES);
283                         return 0;
284
285                 case 'D':
286                         free(arg_directory);
287                         arg_directory = canonicalize_file_name(optarg);
288                         if (!arg_directory) {
289                                 log_error("Invalid root directory: %m");
290                                 return -ENOMEM;
291                         }
292
293                         break;
294
295                 case 'i':
296                         arg_image = optarg;
297                         break;
298
299                 case 'u':
300                         free(arg_user);
301                         arg_user = strdup(optarg);
302                         if (!arg_user)
303                                 return log_oom();
304
305                         break;
306
307                 case ARG_NETWORK_BRIDGE:
308                         arg_network_bridge = optarg;
309
310                         /* fall through */
311
312                 case ARG_NETWORK_VETH:
313                         arg_network_veth = true;
314                         arg_private_network = true;
315                         break;
316
317                 case ARG_NETWORK_INTERFACE:
318                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
319                                 return log_oom();
320
321                         arg_private_network = true;
322                         break;
323
324                 case ARG_NETWORK_MACVLAN:
325                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
326                                 return log_oom();
327
328                         /* fall through */
329
330                 case ARG_PRIVATE_NETWORK:
331                         arg_private_network = true;
332                         break;
333
334                 case 'b':
335                         arg_boot = true;
336                         break;
337
338                 case ARG_UUID:
339                         r = sd_id128_from_string(optarg, &arg_uuid);
340                         if (r < 0) {
341                                 log_error("Invalid UUID: %s", optarg);
342                                 return r;
343                         }
344                         break;
345
346                 case 'S':
347                         arg_slice = optarg;
348                         break;
349
350                 case 'M':
351                         if (isempty(optarg)) {
352                                 free(arg_machine);
353                                 arg_machine = NULL;
354                         } else {
355
356                                 if (!hostname_is_valid(optarg)) {
357                                         log_error("Invalid machine name: %s", optarg);
358                                         return -EINVAL;
359                                 }
360
361                                 free(arg_machine);
362                                 arg_machine = strdup(optarg);
363                                 if (!arg_machine)
364                                         return log_oom();
365
366                                 break;
367                         }
368
369                 case 'Z':
370                         arg_selinux_context = optarg;
371                         break;
372
373                 case 'L':
374                         arg_selinux_apifs_context = optarg;
375                         break;
376
377                 case ARG_READ_ONLY:
378                         arg_read_only = true;
379                         break;
380
381                 case ARG_CAPABILITY:
382                 case ARG_DROP_CAPABILITY: {
383                         char *state, *word;
384                         size_t length;
385
386                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
387                                 _cleanup_free_ char *t;
388                                 cap_value_t cap;
389
390                                 t = strndup(word, length);
391                                 if (!t)
392                                         return log_oom();
393
394                                 if (streq(t, "all")) {
395                                         if (c == ARG_CAPABILITY)
396                                                 plus = (uint64_t) -1;
397                                         else
398                                                 minus = (uint64_t) -1;
399                                 } else {
400                                         if (cap_from_name(t, &cap) < 0) {
401                                                 log_error("Failed to parse capability %s.", t);
402                                                 return -EINVAL;
403                                         }
404
405                                         if (c == ARG_CAPABILITY)
406                                                 plus |= 1ULL << (uint64_t) cap;
407                                         else
408                                                 minus |= 1ULL << (uint64_t) cap;
409                                 }
410                         }
411
412                         break;
413                 }
414
415                 case 'j':
416                         arg_link_journal = LINK_GUEST;
417                         break;
418
419                 case ARG_LINK_JOURNAL:
420                         if (streq(optarg, "auto"))
421                                 arg_link_journal = LINK_AUTO;
422                         else if (streq(optarg, "no"))
423                                 arg_link_journal = LINK_NO;
424                         else if (streq(optarg, "guest"))
425                                 arg_link_journal = LINK_GUEST;
426                         else if (streq(optarg, "host"))
427                                 arg_link_journal = LINK_HOST;
428                         else {
429                                 log_error("Failed to parse link journal mode %s", optarg);
430                                 return -EINVAL;
431                         }
432
433                         break;
434
435                 case ARG_BIND:
436                 case ARG_BIND_RO: {
437                         _cleanup_free_ char *a = NULL, *b = NULL;
438                         char *e;
439                         char ***x;
440
441                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
442
443                         e = strchr(optarg, ':');
444                         if (e) {
445                                 a = strndup(optarg, e - optarg);
446                                 b = strdup(e + 1);
447                         } else {
448                                 a = strdup(optarg);
449                                 b = strdup(optarg);
450                         }
451
452                         if (!a || !b)
453                                 return log_oom();
454
455                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
456                                 log_error("Invalid bind mount specification: %s", optarg);
457                                 return -EINVAL;
458                         }
459
460                         r = strv_extend(x, a);
461                         if (r < 0)
462                                 return log_oom();
463
464                         r = strv_extend(x, b);
465                         if (r < 0)
466                                 return log_oom();
467
468                         break;
469                 }
470
471                 case ARG_SETENV: {
472                         char **n;
473
474                         if (!env_assignment_is_valid(optarg)) {
475                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
476                                 return -EINVAL;
477                         }
478
479                         n = strv_env_set(arg_setenv, optarg);
480                         if (!n)
481                                 return log_oom();
482
483                         strv_free(arg_setenv);
484                         arg_setenv = n;
485                         break;
486                 }
487
488                 case 'q':
489                         arg_quiet = true;
490                         break;
491
492                 case ARG_SHARE_SYSTEM:
493                         arg_share_system = true;
494                         break;
495
496                 case ARG_REGISTER:
497                         r = parse_boolean(optarg);
498                         if (r < 0) {
499                                 log_error("Failed to parse --register= argument: %s", optarg);
500                                 return r;
501                         }
502
503                         arg_register = r;
504                         break;
505
506                 case ARG_KEEP_UNIT:
507                         arg_keep_unit = true;
508                         break;
509
510                 case ARG_PERSONALITY:
511
512                         arg_personality = personality_from_string(optarg);
513                         if (arg_personality == 0xffffffffLU) {
514                                 log_error("Unknown or unsupported personality '%s'.", optarg);
515                                 return -EINVAL;
516                         }
517
518                         break;
519
520                 case '?':
521                         return -EINVAL;
522
523                 default:
524                         assert_not_reached("Unhandled option");
525                 }
526         }
527
528         if (arg_share_system)
529                 arg_register = false;
530
531         if (arg_boot && arg_share_system) {
532                 log_error("--boot and --share-system may not be combined.");
533                 return -EINVAL;
534         }
535
536         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
537                 log_error("--keep-unit may not be used when invoked from a user session.");
538                 return -EINVAL;
539         }
540
541         if (arg_directory && arg_image) {
542                 log_error("--directory= and --image= may not be combined.");
543                 return -EINVAL;
544         }
545
546         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
547
548         return 1;
549 }
550
551 static int mount_all(const char *dest) {
552
553         typedef struct MountPoint {
554                 const char *what;
555                 const char *where;
556                 const char *type;
557                 const char *options;
558                 unsigned long flags;
559                 bool fatal;
560         } MountPoint;
561
562         static const MountPoint mount_table[] = {
563                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
564                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
565                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
566                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
567                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
568                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
569                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
570                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
571 #ifdef HAVE_SELINUX
572                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
573                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
574 #endif
575         };
576
577         unsigned k;
578         int r = 0;
579
580         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
581                 _cleanup_free_ char *where = NULL;
582 #ifdef HAVE_SELINUX
583                 _cleanup_free_ char *options = NULL;
584 #endif
585                 const char *o;
586                 int t;
587
588                 where = strjoin(dest, "/", mount_table[k].where, NULL);
589                 if (!where)
590                         return log_oom();
591
592                 t = path_is_mount_point(where, true);
593                 if (t < 0) {
594                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
595
596                         if (r == 0)
597                                 r = t;
598
599                         continue;
600                 }
601
602                 /* Skip this entry if it is not a remount. */
603                 if (mount_table[k].what && t > 0)
604                         continue;
605
606                 mkdir_p(where, 0755);
607
608 #ifdef HAVE_SELINUX
609                 if (arg_selinux_apifs_context &&
610                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
611                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
612                         if (!options)
613                                 return log_oom();
614
615                         o = options;
616                 } else
617 #endif
618                         o = mount_table[k].options;
619
620
621                 if (mount(mount_table[k].what,
622                           where,
623                           mount_table[k].type,
624                           mount_table[k].flags,
625                           o) < 0 &&
626                     mount_table[k].fatal) {
627
628                         log_error("mount(%s) failed: %m", where);
629
630                         if (r == 0)
631                                 r = -errno;
632                 }
633         }
634
635         return r;
636 }
637
638 static int mount_binds(const char *dest, char **l, unsigned long flags) {
639         char **x, **y;
640
641         STRV_FOREACH_PAIR(x, y, l) {
642                 char *where;
643                 struct stat source_st, dest_st;
644                 int r;
645
646                 if (stat(*x, &source_st) < 0) {
647                         log_error("Failed to stat %s: %m", *x);
648                         return -errno;
649                 }
650
651                 where = strappenda(dest, *y);
652                 r = stat(where, &dest_st);
653                 if (r == 0) {
654                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
655                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
656                                                 *x, where);
657                                 return -EINVAL;
658                         }
659                 } else if (errno == ENOENT) {
660                         r = mkdir_parents_label(where, 0755);
661                         if (r < 0) {
662                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
663                                 return r;
664                         }
665                 } else {
666                         log_error("Failed to bind mount %s: %m", *x);
667                         return -errno;
668                 }
669                 /* Create the mount point, but be conservative -- refuse to create block
670                 * and char devices. */
671                 if (S_ISDIR(source_st.st_mode))
672                         mkdir_label(where, 0755);
673                 else if (S_ISFIFO(source_st.st_mode))
674                         mkfifo(where, 0644);
675                 else if (S_ISSOCK(source_st.st_mode))
676                         mknod(where, 0644 | S_IFSOCK, 0);
677                 else if (S_ISREG(source_st.st_mode))
678                         touch(where);
679                 else {
680                         log_error("Refusing to create mountpoint for file: %s", *x);
681                         return -ENOTSUP;
682                 }
683
684                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
685                         log_error("mount(%s) failed: %m", where);
686                         return -errno;
687                 }
688
689                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
690                         log_error("mount(%s) failed: %m", where);
691                         return -errno;
692                 }
693         }
694
695         return 0;
696 }
697
698 static int setup_timezone(const char *dest) {
699         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
700         char *z, *y;
701         int r;
702
703         assert(dest);
704
705         /* Fix the timezone, if possible */
706         r = readlink_malloc("/etc/localtime", &p);
707         if (r < 0) {
708                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
709                 return 0;
710         }
711
712         z = path_startswith(p, "../usr/share/zoneinfo/");
713         if (!z)
714                 z = path_startswith(p, "/usr/share/zoneinfo/");
715         if (!z) {
716                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
717                 return 0;
718         }
719
720         where = strappend(dest, "/etc/localtime");
721         if (!where)
722                 return log_oom();
723
724         r = readlink_malloc(where, &q);
725         if (r >= 0) {
726                 y = path_startswith(q, "../usr/share/zoneinfo/");
727                 if (!y)
728                         y = path_startswith(q, "/usr/share/zoneinfo/");
729
730
731                 /* Already pointing to the right place? Then do nothing .. */
732                 if (y && streq(y, z))
733                         return 0;
734         }
735
736         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
737         if (!check)
738                 return log_oom();
739
740         if (access(check, F_OK) < 0) {
741                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
742                 return 0;
743         }
744
745         what = strappend("../usr/share/zoneinfo/", z);
746         if (!what)
747                 return log_oom();
748
749         unlink(where);
750         if (symlink(what, where) < 0) {
751                 log_error("Failed to correct timezone of container: %m");
752                 return 0;
753         }
754
755         return 0;
756 }
757
758 static int setup_resolv_conf(const char *dest) {
759         char _cleanup_free_ *where = NULL;
760
761         assert(dest);
762
763         if (arg_private_network)
764                 return 0;
765
766         /* Fix resolv.conf, if possible */
767         where = strappend(dest, "/etc/resolv.conf");
768         if (!where)
769                 return log_oom();
770
771         /* We don't really care for the results of this really. If it
772          * fails, it fails, but meh... */
773         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
774
775         return 0;
776 }
777
778 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
779
780         snprintf(s, 37,
781                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
782                  SD_ID128_FORMAT_VAL(id));
783
784         return s;
785 }
786
787 static int setup_boot_id(const char *dest) {
788         _cleanup_free_ char *from = NULL, *to = NULL;
789         sd_id128_t rnd = {};
790         char as_uuid[37];
791         int r;
792
793         assert(dest);
794
795         if (arg_share_system)
796                 return 0;
797
798         /* Generate a new randomized boot ID, so that each boot-up of
799          * the container gets a new one */
800
801         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
802         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
803         if (!from || !to)
804                 return log_oom();
805
806         r = sd_id128_randomize(&rnd);
807         if (r < 0) {
808                 log_error("Failed to generate random boot id: %s", strerror(-r));
809                 return r;
810         }
811
812         id128_format_as_uuid(rnd, as_uuid);
813
814         r = write_string_file(from, as_uuid);
815         if (r < 0) {
816                 log_error("Failed to write boot id: %s", strerror(-r));
817                 return r;
818         }
819
820         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
821                 log_error("Failed to bind mount boot id: %m");
822                 r = -errno;
823         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
824                 log_warning("Failed to make boot id read-only: %m");
825
826         unlink(from);
827         return r;
828 }
829
830 static int copy_devnodes(const char *dest) {
831
832         static const char devnodes[] =
833                 "null\0"
834                 "zero\0"
835                 "full\0"
836                 "random\0"
837                 "urandom\0"
838                 "tty\0";
839
840         const char *d;
841         int r = 0;
842         _cleanup_umask_ mode_t u;
843
844         assert(dest);
845
846         u = umask(0000);
847
848         NULSTR_FOREACH(d, devnodes) {
849                 _cleanup_free_ char *from = NULL, *to = NULL;
850                 struct stat st;
851
852                 from = strappend("/dev/", d);
853                 to = strjoin(dest, "/dev/", d, NULL);
854                 if (!from || !to)
855                         return log_oom();
856
857                 if (stat(from, &st) < 0) {
858
859                         if (errno != ENOENT) {
860                                 log_error("Failed to stat %s: %m", from);
861                                 return -errno;
862                         }
863
864                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
865
866                         log_error("%s is not a char or block device, cannot copy", from);
867                         return -EIO;
868
869                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
870
871                         log_error("mknod(%s) failed: %m", dest);
872                         return  -errno;
873                 }
874         }
875
876         return r;
877 }
878
879 static int setup_ptmx(const char *dest) {
880         _cleanup_free_ char *p = NULL;
881
882         p = strappend(dest, "/dev/ptmx");
883         if (!p)
884                 return log_oom();
885
886         if (symlink("pts/ptmx", p) < 0) {
887                 log_error("Failed to create /dev/ptmx symlink: %m");
888                 return -errno;
889         }
890
891         return 0;
892 }
893
894 static int setup_dev_console(const char *dest, const char *console) {
895         _cleanup_umask_ mode_t u;
896         const char *to;
897         struct stat st;
898         int r;
899
900         assert(dest);
901         assert(console);
902
903         u = umask(0000);
904
905         if (stat("/dev/null", &st) < 0) {
906                 log_error("Failed to stat /dev/null: %m");
907                 return -errno;
908         }
909
910         r = chmod_and_chown(console, 0600, 0, 0);
911         if (r < 0) {
912                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
913                 return r;
914         }
915
916         /* We need to bind mount the right tty to /dev/console since
917          * ptys can only exist on pts file systems. To have something
918          * to bind mount things on we create a device node first, and
919          * use /dev/null for that since we the cgroups device policy
920          * allows us to create that freely, while we cannot create
921          * /dev/console. (Note that the major minor doesn't actually
922          * matter here, since we mount it over anyway). */
923
924         to = strappenda(dest, "/dev/console");
925         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
926                 log_error("mknod() for /dev/console failed: %m");
927                 return -errno;
928         }
929
930         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
931                 log_error("Bind mount for /dev/console failed: %m");
932                 return -errno;
933         }
934
935         return 0;
936 }
937
938 static int setup_kmsg(const char *dest, int kmsg_socket) {
939         _cleanup_free_ char *from = NULL, *to = NULL;
940         int r, fd, k;
941         _cleanup_umask_ mode_t u;
942         union {
943                 struct cmsghdr cmsghdr;
944                 uint8_t buf[CMSG_SPACE(sizeof(int))];
945         } control = {};
946         struct msghdr mh = {
947                 .msg_control = &control,
948                 .msg_controllen = sizeof(control),
949         };
950         struct cmsghdr *cmsg;
951
952         assert(dest);
953         assert(kmsg_socket >= 0);
954
955         u = umask(0000);
956
957         /* We create the kmsg FIFO as /dev/kmsg, but immediately
958          * delete it after bind mounting it to /proc/kmsg. While FIFOs
959          * on the reading side behave very similar to /proc/kmsg,
960          * their writing side behaves differently from /dev/kmsg in
961          * that writing blocks when nothing is reading. In order to
962          * avoid any problems with containers deadlocking due to this
963          * we simply make /dev/kmsg unavailable to the container. */
964         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
965             asprintf(&to, "%s/proc/kmsg", dest) < 0)
966                 return log_oom();
967
968         if (mkfifo(from, 0600) < 0) {
969                 log_error("mkfifo() for /dev/kmsg failed: %m");
970                 return -errno;
971         }
972
973         r = chmod_and_chown(from, 0600, 0, 0);
974         if (r < 0) {
975                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
976                 return r;
977         }
978
979         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
980                 log_error("Bind mount for /proc/kmsg failed: %m");
981                 return -errno;
982         }
983
984         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
985         if (fd < 0) {
986                 log_error("Failed to open fifo: %m");
987                 return -errno;
988         }
989
990         cmsg = CMSG_FIRSTHDR(&mh);
991         cmsg->cmsg_level = SOL_SOCKET;
992         cmsg->cmsg_type = SCM_RIGHTS;
993         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
994         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
995
996         mh.msg_controllen = cmsg->cmsg_len;
997
998         /* Store away the fd in the socket, so that it stays open as
999          * long as we run the child */
1000         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1001         safe_close(fd);
1002
1003         if (k < 0) {
1004                 log_error("Failed to send FIFO fd: %m");
1005                 return -errno;
1006         }
1007
1008         /* And now make the FIFO unavailable as /dev/kmsg... */
1009         unlink(from);
1010         return 0;
1011 }
1012
1013 static int setup_hostname(void) {
1014
1015         if (arg_share_system)
1016                 return 0;
1017
1018         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1019                 return -errno;
1020
1021         return 0;
1022 }
1023
1024 static int setup_journal(const char *directory) {
1025         sd_id128_t machine_id, this_id;
1026         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1027         char *id;
1028         int r;
1029
1030         p = strappend(directory, "/etc/machine-id");
1031         if (!p)
1032                 return log_oom();
1033
1034         r = read_one_line_file(p, &b);
1035         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1036                 return 0;
1037         else if (r < 0) {
1038                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1039                 return r;
1040         }
1041
1042         id = strstrip(b);
1043         if (isempty(id) && arg_link_journal == LINK_AUTO)
1044                 return 0;
1045
1046         /* Verify validity */
1047         r = sd_id128_from_string(id, &machine_id);
1048         if (r < 0) {
1049                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1050                 return r;
1051         }
1052
1053         r = sd_id128_get_machine(&this_id);
1054         if (r < 0) {
1055                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1056                 return r;
1057         }
1058
1059         if (sd_id128_equal(machine_id, this_id)) {
1060                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1061                          "Host and machine ids are equal (%s): refusing to link journals", id);
1062                 if (arg_link_journal == LINK_AUTO)
1063                         return 0;
1064                 return
1065                         -EEXIST;
1066         }
1067
1068         if (arg_link_journal == LINK_NO)
1069                 return 0;
1070
1071         free(p);
1072         p = strappend("/var/log/journal/", id);
1073         q = strjoin(directory, "/var/log/journal/", id, NULL);
1074         if (!p || !q)
1075                 return log_oom();
1076
1077         if (path_is_mount_point(p, false) > 0) {
1078                 if (arg_link_journal != LINK_AUTO) {
1079                         log_error("%s: already a mount point, refusing to use for journal", p);
1080                         return -EEXIST;
1081                 }
1082
1083                 return 0;
1084         }
1085
1086         if (path_is_mount_point(q, false) > 0) {
1087                 if (arg_link_journal != LINK_AUTO) {
1088                         log_error("%s: already a mount point, refusing to use for journal", q);
1089                         return -EEXIST;
1090                 }
1091
1092                 return 0;
1093         }
1094
1095         r = readlink_and_make_absolute(p, &d);
1096         if (r >= 0) {
1097                 if ((arg_link_journal == LINK_GUEST ||
1098                      arg_link_journal == LINK_AUTO) &&
1099                     path_equal(d, q)) {
1100
1101                         r = mkdir_p(q, 0755);
1102                         if (r < 0)
1103                                 log_warning("failed to create directory %s: %m", q);
1104                         return 0;
1105                 }
1106
1107                 if (unlink(p) < 0) {
1108                         log_error("Failed to remove symlink %s: %m", p);
1109                         return -errno;
1110                 }
1111         } else if (r == -EINVAL) {
1112
1113                 if (arg_link_journal == LINK_GUEST &&
1114                     rmdir(p) < 0) {
1115
1116                         if (errno == ENOTDIR) {
1117                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1118                                 return r;
1119                         } else {
1120                                 log_error("Failed to remove %s: %m", p);
1121                                 return -errno;
1122                         }
1123                 }
1124         } else if (r != -ENOENT) {
1125                 log_error("readlink(%s) failed: %m", p);
1126                 return r;
1127         }
1128
1129         if (arg_link_journal == LINK_GUEST) {
1130
1131                 if (symlink(q, p) < 0) {
1132                         log_error("Failed to symlink %s to %s: %m", q, p);
1133                         return -errno;
1134                 }
1135
1136                 r = mkdir_p(q, 0755);
1137                 if (r < 0)
1138                         log_warning("failed to create directory %s: %m", q);
1139                 return 0;
1140         }
1141
1142         if (arg_link_journal == LINK_HOST) {
1143                 r = mkdir_p(p, 0755);
1144                 if (r < 0) {
1145                         log_error("Failed to create %s: %m", p);
1146                         return r;
1147                 }
1148
1149         } else if (access(p, F_OK) < 0)
1150                 return 0;
1151
1152         if (dir_is_empty(q) == 0)
1153                 log_warning("%s is not empty, proceeding anyway.", q);
1154
1155         r = mkdir_p(q, 0755);
1156         if (r < 0) {
1157                 log_error("Failed to create %s: %m", q);
1158                 return r;
1159         }
1160
1161         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1162                 log_error("Failed to bind mount journal from host into guest: %m");
1163                 return -errno;
1164         }
1165
1166         return 0;
1167 }
1168
1169 static int setup_kdbus(const char *dest, const char *path) {
1170         const char *p;
1171
1172         if (!path)
1173                 return 0;
1174
1175         p = strappenda(dest, "/dev/kdbus");
1176         if (mkdir(p, 0755) < 0) {
1177                 log_error("Failed to create kdbus path: %m");
1178                 return  -errno;
1179         }
1180
1181         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1182                 log_error("Failed to mount kdbus domain path: %m");
1183                 return -errno;
1184         }
1185
1186         return 0;
1187 }
1188
1189 static int drop_capabilities(void) {
1190         return capability_bounding_set_drop(~arg_retain, false);
1191 }
1192
1193 static int register_machine(pid_t pid) {
1194         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1195         _cleanup_bus_unref_ sd_bus *bus = NULL;
1196         int r;
1197
1198         if (!arg_register)
1199                 return 0;
1200
1201         r = sd_bus_default_system(&bus);
1202         if (r < 0) {
1203                 log_error("Failed to open system bus: %s", strerror(-r));
1204                 return r;
1205         }
1206
1207         if (arg_keep_unit) {
1208                 r = sd_bus_call_method(
1209                                 bus,
1210                                 "org.freedesktop.machine1",
1211                                 "/org/freedesktop/machine1",
1212                                 "org.freedesktop.machine1.Manager",
1213                                 "RegisterMachine",
1214                                 &error,
1215                                 NULL,
1216                                 "sayssus",
1217                                 arg_machine,
1218                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1219                                 "nspawn",
1220                                 "container",
1221                                 (uint32_t) pid,
1222                                 strempty(arg_directory));
1223         } else {
1224                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1225
1226                 r = sd_bus_message_new_method_call(
1227                                 bus,
1228                                 &m,
1229                                 "org.freedesktop.machine1",
1230                                 "/org/freedesktop/machine1",
1231                                 "org.freedesktop.machine1.Manager",
1232                                 "CreateMachine");
1233                 if (r < 0) {
1234                         log_error("Failed to create message: %s", strerror(-r));
1235                         return r;
1236                 }
1237
1238                 r = sd_bus_message_append(
1239                                 m,
1240                                 "sayssus",
1241                                 arg_machine,
1242                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1243                                 "nspawn",
1244                                 "container",
1245                                 (uint32_t) pid,
1246                                 strempty(arg_directory));
1247                 if (r < 0) {
1248                         log_error("Failed to append message arguments: %s", strerror(-r));
1249                         return r;
1250                 }
1251
1252                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1253                 if (r < 0) {
1254                         log_error("Failed to open container: %s", strerror(-r));
1255                         return r;
1256                 }
1257
1258                 if (!isempty(arg_slice)) {
1259                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1260                         if (r < 0) {
1261                                 log_error("Failed to append slice: %s", strerror(-r));
1262                                 return r;
1263                         }
1264                 }
1265
1266                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1267                 if (r < 0) {
1268                         log_error("Failed to add device policy: %s", strerror(-r));
1269                         return r;
1270                 }
1271
1272                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1273                                           /* Allow the container to
1274                                            * access and create the API
1275                                            * device nodes, so that
1276                                            * PrivateDevices= in the
1277                                            * container can work
1278                                            * fine */
1279                                           "/dev/null", "rwm",
1280                                           "/dev/zero", "rwm",
1281                                           "/dev/full", "rwm",
1282                                           "/dev/random", "rwm",
1283                                           "/dev/urandom", "rwm",
1284                                           "/dev/tty", "rwm",
1285                                           /* Allow the container
1286                                            * access to ptys. However,
1287                                            * do not permit the
1288                                            * container to ever create
1289                                            * these device nodes. */
1290                                           "/dev/pts/ptmx", "rw",
1291                                           "char-pts", "rw",
1292                                           /* Allow the container
1293                                            * access to all kdbus
1294                                            * devices. Again, the
1295                                            * container cannot create
1296                                            * these nodes, only use
1297                                            * them. We use a pretty
1298                                            * open match here, so that
1299                                            * the kernel API can still
1300                                            * change. */
1301                                           "char-kdbus", "rw",
1302                                           "char-kdbus/*", "rw");
1303                 if (r < 0) {
1304                         log_error("Failed to add device whitelist: %s", strerror(-r));
1305                         return r;
1306                 }
1307
1308                 r = sd_bus_message_close_container(m);
1309                 if (r < 0) {
1310                         log_error("Failed to close container: %s", strerror(-r));
1311                         return r;
1312                 }
1313
1314                 r = sd_bus_call(bus, m, 0, &error, NULL);
1315         }
1316
1317         if (r < 0) {
1318                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1319                 return r;
1320         }
1321
1322         return 0;
1323 }
1324
1325 static int terminate_machine(pid_t pid) {
1326         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1327         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1328         _cleanup_bus_unref_ sd_bus *bus = NULL;
1329         const char *path;
1330         int r;
1331
1332         if (!arg_register)
1333                 return 0;
1334
1335         r = sd_bus_default_system(&bus);
1336         if (r < 0) {
1337                 log_error("Failed to open system bus: %s", strerror(-r));
1338                 return r;
1339         }
1340
1341         r = sd_bus_call_method(
1342                         bus,
1343                         "org.freedesktop.machine1",
1344                         "/org/freedesktop/machine1",
1345                         "org.freedesktop.machine1.Manager",
1346                         "GetMachineByPID",
1347                         &error,
1348                         &reply,
1349                         "u",
1350                         (uint32_t) pid);
1351         if (r < 0) {
1352                 /* Note that the machine might already have been
1353                  * cleaned up automatically, hence don't consider it a
1354                  * failure if we cannot get the machine object. */
1355                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1356                 return 0;
1357         }
1358
1359         r = sd_bus_message_read(reply, "o", &path);
1360         if (r < 0)
1361                 return bus_log_parse_error(r);
1362
1363         r = sd_bus_call_method(
1364                         bus,
1365                         "org.freedesktop.machine1",
1366                         path,
1367                         "org.freedesktop.machine1.Machine",
1368                         "Terminate",
1369                         &error,
1370                         NULL,
1371                         NULL);
1372         if (r < 0) {
1373                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1374                 return 0;
1375         }
1376
1377         return 0;
1378 }
1379
1380 static int reset_audit_loginuid(void) {
1381         _cleanup_free_ char *p = NULL;
1382         int r;
1383
1384         if (arg_share_system)
1385                 return 0;
1386
1387         r = read_one_line_file("/proc/self/loginuid", &p);
1388         if (r == -ENOENT)
1389                 return 0;
1390         if (r < 0) {
1391                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1392                 return r;
1393         }
1394
1395         /* Already reset? */
1396         if (streq(p, "4294967295"))
1397                 return 0;
1398
1399         r = write_string_file("/proc/self/loginuid", "4294967295");
1400         if (r < 0) {
1401                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1402                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1403                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1404                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1405                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1406
1407                 sleep(5);
1408         }
1409
1410         return 0;
1411 }
1412
1413 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1414
1415 static int get_mac(struct ether_addr *mac) {
1416         int r;
1417
1418         uint8_t result[8];
1419         size_t l, sz;
1420         uint8_t *v;
1421
1422         l = strlen(arg_machine);
1423         sz = sizeof(sd_id128_t) + l;
1424         v = alloca(sz);
1425
1426         /* fetch some persistent data unique to the host */
1427         r = sd_id128_get_machine((sd_id128_t*) v);
1428         if (r < 0)
1429                 return r;
1430
1431         /* combine with some data unique (on this host) to this
1432          * container instance */
1433         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1434
1435         /* Let's hash the host machine ID plus the container name. We
1436          * use a fixed, but originally randomly created hash key here. */
1437         siphash24(result, v, sz, HASH_KEY.bytes);
1438
1439         assert_cc(ETH_ALEN <= sizeof(result));
1440         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1441
1442         /* see eth_random_addr in the kernel */
1443         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1444         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1445
1446         return 0;
1447 }
1448
1449 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1450         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1451         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1452         struct ether_addr mac;
1453         int r;
1454
1455         if (!arg_private_network)
1456                 return 0;
1457
1458         if (!arg_network_veth)
1459                 return 0;
1460
1461         /* Use two different interface name prefixes depending whether
1462          * we are in bridge mode or not. */
1463         if (arg_network_bridge)
1464                 memcpy(iface_name, "vb-", 3);
1465         else
1466                 memcpy(iface_name, "ve-", 3);
1467         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1468
1469         r = get_mac(&mac);
1470         if (r < 0) {
1471                 log_error("Failed to generate predictable MAC address for host0");
1472                 return r;
1473         }
1474
1475         r = sd_rtnl_open(&rtnl, 0);
1476         if (r < 0) {
1477                 log_error("Failed to connect to netlink: %s", strerror(-r));
1478                 return r;
1479         }
1480
1481         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1482         if (r < 0) {
1483                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1484                 return r;
1485         }
1486
1487         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1488         if (r < 0) {
1489                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1490                 return r;
1491         }
1492
1493         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1494         if (r < 0) {
1495                 log_error("Failed to open netlink container: %s", strerror(-r));
1496                 return r;
1497         }
1498
1499         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1500         if (r < 0) {
1501                 log_error("Failed to open netlink container: %s", strerror(-r));
1502                 return r;
1503         }
1504
1505         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1506         if (r < 0) {
1507                 log_error("Failed to open netlink container: %s", strerror(-r));
1508                 return r;
1509         }
1510
1511         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1512         if (r < 0) {
1513                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1514                 return r;
1515         }
1516
1517         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1518         if (r < 0) {
1519                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1520                 return r;
1521         }
1522
1523         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1524         if (r < 0) {
1525                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1526                 return r;
1527         }
1528
1529         r = sd_rtnl_message_close_container(m);
1530         if (r < 0) {
1531                 log_error("Failed to close netlink container: %s", strerror(-r));
1532                 return r;
1533         }
1534
1535         r = sd_rtnl_message_close_container(m);
1536         if (r < 0) {
1537                 log_error("Failed to close netlink container: %s", strerror(-r));
1538                 return r;
1539         }
1540
1541         r = sd_rtnl_message_close_container(m);
1542         if (r < 0) {
1543                 log_error("Failed to close netlink container: %s", strerror(-r));
1544                 return r;
1545         }
1546
1547         r = sd_rtnl_call(rtnl, m, 0, NULL);
1548         if (r < 0) {
1549                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1550                 return r;
1551         }
1552
1553         return 0;
1554 }
1555
1556 static int setup_bridge(const char veth_name[]) {
1557         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1558         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1559         int r, bridge;
1560
1561         if (!arg_private_network)
1562                 return 0;
1563
1564         if (!arg_network_veth)
1565                 return 0;
1566
1567         if (!arg_network_bridge)
1568                 return 0;
1569
1570         bridge = (int) if_nametoindex(arg_network_bridge);
1571         if (bridge <= 0) {
1572                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1573                 return -errno;
1574         }
1575
1576         r = sd_rtnl_open(&rtnl, 0);
1577         if (r < 0) {
1578                 log_error("Failed to connect to netlink: %s", strerror(-r));
1579                 return r;
1580         }
1581
1582         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1583         if (r < 0) {
1584                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1585                 return r;
1586         }
1587
1588         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1589         if (r < 0) {
1590                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1591                 return r;
1592         }
1593
1594         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1595         if (r < 0) {
1596                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1597                 return r;
1598         }
1599
1600         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1601         if (r < 0) {
1602                 log_error("Failed to add netlink master field: %s", strerror(-r));
1603                 return r;
1604         }
1605
1606         r = sd_rtnl_call(rtnl, m, 0, NULL);
1607         if (r < 0) {
1608                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1609                 return r;
1610         }
1611
1612         return 0;
1613 }
1614
1615 static int parse_interface(struct udev *udev, const char *name) {
1616         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1617         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1618         int ifi;
1619
1620         ifi = (int) if_nametoindex(name);
1621         if (ifi <= 0) {
1622                 log_error("Failed to resolve interface %s: %m", name);
1623                 return -errno;
1624         }
1625
1626         sprintf(ifi_str, "n%i", ifi);
1627         d = udev_device_new_from_device_id(udev, ifi_str);
1628         if (!d) {
1629                 log_error("Failed to get udev device for interface %s: %m", name);
1630                 return -errno;
1631         }
1632
1633         if (udev_device_get_is_initialized(d) <= 0) {
1634                 log_error("Network interface %s is not initialized yet.", name);
1635                 return -EBUSY;
1636         }
1637
1638         return ifi;
1639 }
1640
1641 static int move_network_interfaces(pid_t pid) {
1642         _cleanup_udev_unref_ struct udev *udev = NULL;
1643         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1644         char **i;
1645         int r;
1646
1647         if (!arg_private_network)
1648                 return 0;
1649
1650         if (strv_isempty(arg_network_interfaces))
1651                 return 0;
1652
1653         r = sd_rtnl_open(&rtnl, 0);
1654         if (r < 0) {
1655                 log_error("Failed to connect to netlink: %s", strerror(-r));
1656                 return r;
1657         }
1658
1659         udev = udev_new();
1660         if (!udev) {
1661                 log_error("Failed to connect to udev.");
1662                 return -ENOMEM;
1663         }
1664
1665         STRV_FOREACH(i, arg_network_interfaces) {
1666                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1667                 int ifi;
1668
1669                 ifi = parse_interface(udev, *i);
1670                 if (ifi < 0)
1671                         return ifi;
1672
1673                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1674                 if (r < 0) {
1675                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1676                         return r;
1677                 }
1678
1679                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1680                 if (r < 0) {
1681                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1682                         return r;
1683                 }
1684
1685                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1686                 if (r < 0) {
1687                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1688                         return r;
1689                 }
1690         }
1691
1692         return 0;
1693 }
1694
1695 static int setup_macvlan(pid_t pid) {
1696         _cleanup_udev_unref_ struct udev *udev = NULL;
1697         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1698         char **i;
1699         int r;
1700
1701         if (!arg_private_network)
1702                 return 0;
1703
1704         if (strv_isempty(arg_network_macvlan))
1705                 return 0;
1706
1707         r = sd_rtnl_open(&rtnl, 0);
1708         if (r < 0) {
1709                 log_error("Failed to connect to netlink: %s", strerror(-r));
1710                 return r;
1711         }
1712
1713         udev = udev_new();
1714         if (!udev) {
1715                 log_error("Failed to connect to udev.");
1716                 return -ENOMEM;
1717         }
1718
1719         STRV_FOREACH(i, arg_network_macvlan) {
1720                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1721                 _cleanup_free_ char *n = NULL;
1722                 int ifi;
1723
1724                 ifi = parse_interface(udev, *i);
1725                 if (ifi < 0)
1726                         return ifi;
1727
1728                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1729                 if (r < 0) {
1730                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1731                         return r;
1732                 }
1733
1734                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1735                 if (r < 0) {
1736                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1737                         return r;
1738                 }
1739
1740                 n = strappend("mv-", *i);
1741                 if (!n)
1742                         return log_oom();
1743
1744                 strshorten(n, IFNAMSIZ-1);
1745
1746                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1747                 if (r < 0) {
1748                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1749                         return r;
1750                 }
1751
1752                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1753                 if (r < 0) {
1754                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1755                         return r;
1756                 }
1757
1758                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1759                 if (r < 0) {
1760                         log_error("Failed to open netlink container: %s", strerror(-r));
1761                         return r;
1762                 }
1763
1764                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1765                 if (r < 0) {
1766                         log_error("Failed to open netlink container: %s", strerror(-r));
1767                         return r;
1768                 }
1769
1770                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1771                 if (r < 0) {
1772                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1773                         return r;
1774                 }
1775
1776                 r = sd_rtnl_message_close_container(m);
1777                 if (r < 0) {
1778                         log_error("Failed to close netlink container: %s", strerror(-r));
1779                         return r;
1780                 }
1781
1782                 r = sd_rtnl_message_close_container(m);
1783                 if (r < 0) {
1784                         log_error("Failed to close netlink container: %s", strerror(-r));
1785                         return r;
1786                 }
1787
1788                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1789                 if (r < 0) {
1790                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1791                         return r;
1792                 }
1793         }
1794
1795         return 0;
1796 }
1797
/*
 * Install a seccomp filter that blocks creation of audit netlink sockets.
 *
 * Without HAVE_SECCOMP this does nothing and returns 0. Otherwise returns 0
 * on success or a negative value on failure (logged).
 */
static int audit_still_doesnt_work_in_containers(void) {

#ifndef HAVE_SECCOMP
        return 0;
#else
        scmp_filter_ctx ctx;
        int k;

        /*
           Audit does not work inside containers and much of the
           userspace audit hookup fails there. We do not care and simply
           prevent audit sockets from being created.

           The filter makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT, which audit userspace takes as a sign
           that audit is disabled in the kernel.
         */

        /* Default action: allow everything not matched by a rule */
        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        k = seccomp_add_secondary_archs(ctx);
        if (k < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-k));
                goto out;
        }

        /* Match on argument 0 (domain) and argument 2 (protocol) of socket() */
        k = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (k < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-k));
                goto out;
        }

        /* Keep the filter from turning on NO_NEW_PRIVS */
        k = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (k < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-k));
                goto out;
        }

        k = seccomp_load(ctx);
        if (k < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-k));

out:
        /* The filter context is released on success and failure alike */
        seccomp_release(ctx);
        return k;
#endif

}
1854
1855 static int setup_image(char **device_path, int *loop_nr) {
1856         struct loop_info64 info = {
1857                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1858         };
1859         _cleanup_close_ int fd = -1, control = -1, loop = -1;
1860         _cleanup_free_ char* loopdev = NULL;
1861         struct stat st;
1862         int r, nr;
1863
1864         assert(device_path);
1865         assert(loop_nr);
1866
1867         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1868         if (fd < 0) {
1869                 log_error("Failed to open %s: %m", arg_image);
1870                 return -errno;
1871         }
1872
1873         if (fstat(fd, &st) < 0) {
1874                 log_error("Failed to stat %s: %m", arg_image);
1875                 return -errno;
1876         }
1877
1878         if (S_ISBLK(st.st_mode)) {
1879                 char *p;
1880
1881                 p = strdup(arg_image);
1882                 if (!p)
1883                         return log_oom();
1884
1885                 *device_path = p;
1886
1887                 *loop_nr = -1;
1888
1889                 r = fd;
1890                 fd = -1;
1891
1892                 return r;
1893         }
1894
1895         if (!S_ISREG(st.st_mode)) {
1896                 log_error("%s is not a regular file or block device: %m", arg_image);
1897                 return -EINVAL;
1898         }
1899
1900         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1901         if (control < 0) {
1902                 log_error("Failed to open /dev/loop-control: %m");
1903                 return -errno;
1904         }
1905
1906         nr = ioctl(control, LOOP_CTL_GET_FREE);
1907         if (nr < 0) {
1908                 log_error("Failed to allocate loop device: %m");
1909                 return -errno;
1910         }
1911
1912         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1913                 return log_oom();
1914
1915         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1916         if (loop < 0) {
1917                 log_error("Failed to open loop device %s: %m", loopdev);
1918                 return -errno;
1919         }
1920
1921         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1922                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1923                 return -errno;
1924         }
1925
1926         if (arg_read_only)
1927                 info.lo_flags |= LO_FLAGS_READ_ONLY;
1928
1929         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1930                 log_error("Failed to set loopback settings on %s: %m", loopdev);
1931                 return -errno;
1932         }
1933
1934         *device_path = loopdev;
1935         loopdev = NULL;
1936
1937         *loop_nr = nr;
1938
1939         r = loop;
1940         loop = -1;
1941
1942         return r;
1943 }
1944
#ifdef HAVE_BLKID
/*
 * Record node as the device for one partition role, unless a candidate with
 * an equal or lower partition number has been recorded already.
 *
 * *device, *device_nr and *device_rw hold the current best candidate for the
 * role (*device is NULL while there is none). When the new partition wins,
 * *device is replaced by a copy of node and the previous string is freed.
 *
 * Returns 0 on success (whether or not the candidate was taken), or the
 * result of log_oom() if the copy could not be allocated.
 */
static int dissect_pick_partition(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, bool rw) {

        char *copy;

        if (*device && nr >= *device_nr)
                return 0;

        copy = strdup(node);
        if (!copy)
                return log_oom();

        free(*device);
        *device = copy;
        *device_nr = nr;
        *device_rw = rw;

        return 0;
}
#endif

/*
 * Find the root, /home and /srv partitions of the GPT disk image open on fd.
 *
 * The partition table is probed with libblkid and must be of type "gpt".
 * The child devices of the block device are then enumerated through udev and
 * matched against the blkid partition list by device number. Partitions
 * carrying GPT_FLAG_NO_AUTO are ignored. The remaining ones are classified
 * by their type UUID (GPT_HOME, GPT_SRV and, where defined, GPT_ROOT_NATIVE
 * and GPT_ROOT_SECONDARY); per role the partition with the lowest partition
 * number wins.
 *
 * On success (return value 0):
 *   - *root_device is set to the native root partition if one was found,
 *     otherwise to the secondary root partition; *secondary tells which one
 *     it is and *root_device_rw whether GPT_FLAG_READ_ONLY is unset on it,
 *   - *home_device / *home_device_rw and *srv_device / *srv_device_rw are
 *     set only if a matching partition was found and are left untouched
 *     otherwise.
 * The returned strings are newly allocated and owned by the caller.
 *
 * Returns -EINVAL if no partition table, no GPT or no root partition could
 * be identified, another negative errno-style value on other failures, and
 * -ENOTSUP when compiled without blkid support. Failures are logged.
 */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared first so that a failure which leaves it unset
         * can be told apart and reported as out-of-memory */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate all devices that have the image's block device as parent */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Initialized so that the cleanup handler never sees an
                 * indeterminate pointer */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;
                bool rw;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                rw = !(flags & GPT_FLAG_READ_ONLY);

                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_pick_partition(&home, &home_nr, &home_rw, node, nr, rw);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_pick_partition(&srv, &srv_nr, &srv_rw, node, nr, rw);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_pick_partition(&root, &root_nr, &root_rw, node, nr, rw);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_pick_partition(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, rw);
#endif
                else
                        r = 0; /* Partition type we are not interested in */

                if (r < 0)
                        return r;
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Hand the results out; setting the local pointer to NULL disarms
         * its cleanup handler. The native root is preferred over the
         * secondary one. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2194
/*
 * Mount the block device what on where (or on where with directory appended
 * to it, if directory is non-NULL).
 *
 * The file system type is detected with libblkid. The mount uses MS_NODEV
 * and is read-only if rw is false or arg_read_only is set. LUKS volumes
 * ("crypto_LUKS") are rejected with -ENOTSUP.
 *
 * Returns 0 on success, -EINVAL if the file system type cannot be
 * determined, another negative errno-style value on other failures, and
 * -ENOTSUP when compiled without blkid support. Failures are logged.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally read-only container overrides the per-partition setting */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns 1 when nothing was detected
                 * and -2 when the result is ambivalent; in both cases there
                 * is no usable file system type. -1 is a real probing error
                 * and is handled by the branch below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2260
/*
 * Mount the partitions found in a disk image below where.
 *
 * root_device is mounted on where itself, home_device on where + "/home"
 * and srv_device on where + "/srv". Each device may be NULL, in which case
 * it is skipped. The *_rw flags are passed on to mount_device() and select
 * between a writable and a read-only mount.
 *
 * The mounts are attempted in the order root, home, srv and the function
 * stops at the first failure; earlier mounts are not undone here.
 *
 * Returns 0 on success or the negative errno-style value of the failing
 * mount_device() call (logged).
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount below the directory the caller asked for rather than
         * reaching for the arg_directory global behind its back */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2296
2297 static void loop_remove(int nr, int *image_fd) {
2298         _cleanup_close_ int control = -1;
2299
2300         if (nr < 0)
2301                 return;
2302
2303         if (image_fd && *image_fd >= 0) {
2304                 ioctl(*image_fd, LOOP_CLR_FD);
2305                 *image_fd = safe_close(*image_fd);
2306         }
2307
2308         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2309         if (control < 0)
2310                 return;
2311
2312         ioctl(control, LOOP_CTL_REMOVE, nr);
2313 }
2314
2315 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2316         int pipe_fds[2];
2317         pid_t pid;
2318
2319         assert(database);
2320         assert(key);
2321         assert(rpid);
2322
2323         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2324                 log_error("Failed to allocate pipe: %m");
2325                 return -errno;
2326         }
2327
2328         pid = fork();
2329         if (pid < 0) {
2330                 log_error("Failed to fork getent child: %m");
2331                 return -errno;
2332         } else if (pid == 0) {
2333                 int nullfd;
2334                 char *empty_env = NULL;
2335
2336                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2337                         _exit(EXIT_FAILURE);
2338
2339                 if (pipe_fds[0] > 2)
2340                         safe_close(pipe_fds[0]);
2341                 if (pipe_fds[1] > 2)
2342                         safe_close(pipe_fds[1]);
2343
2344                 nullfd = open("/dev/null", O_RDWR);
2345                 if (nullfd < 0)
2346                         _exit(EXIT_FAILURE);
2347
2348                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2349                         _exit(EXIT_FAILURE);
2350
2351                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2352                         _exit(EXIT_FAILURE);
2353
2354                 if (nullfd > 2)
2355                         safe_close(nullfd);
2356
2357                 reset_all_signal_handlers();
2358                 close_all_fds(NULL, 0);
2359
2360                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2361                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2362                 _exit(EXIT_FAILURE);
2363         }
2364
2365         pipe_fds[1] = safe_close(pipe_fds[1]);
2366
2367         *rpid = pid;
2368
2369         return pipe_fds[0];
2370 }
2371
2372 static int change_uid_gid(char **_home) {
2373         char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2374         _cleanup_free_ uid_t *uids = NULL;
2375         _cleanup_free_ char *home = NULL;
2376         _cleanup_fclose_ FILE *f = NULL;
2377         _cleanup_close_ int fd = -1;
2378         unsigned n_uids = 0;
2379         size_t sz = 0, l;
2380         uid_t uid;
2381         gid_t gid;
2382         pid_t pid;
2383         int r;
2384
2385         assert(_home);
2386
2387         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2388                 /* Reset everything fully to 0, just in case */
2389
2390                 if (setgroups(0, NULL) < 0) {
2391                         log_error("setgroups() failed: %m");
2392                         return -errno;
2393                 }
2394
2395                 if (setresgid(0, 0, 0) < 0) {
2396                         log_error("setregid() failed: %m");
2397                         return -errno;
2398                 }
2399
2400                 if (setresuid(0, 0, 0) < 0) {
2401                         log_error("setreuid() failed: %m");
2402                         return -errno;
2403                 }
2404
2405                 *_home = NULL;
2406                 return 0;
2407         }
2408
2409         /* First, get user credentials */
2410         fd = spawn_getent("passwd", arg_user, &pid);
2411         if (fd < 0)
2412                 return fd;
2413
2414         f = fdopen(fd, "r");
2415         if (!f)
2416                 return log_oom();
2417         fd = -1;
2418
2419         if (!fgets(line, sizeof(line), f)) {
2420
2421                 if (!ferror(f)) {
2422                         log_error("Failed to resolve user %s.", arg_user);
2423                         return -ESRCH;
2424                 }
2425
2426                 log_error("Failed to read from getent: %m");
2427                 return -errno;
2428         }
2429
2430         truncate_nl(line);
2431
2432         wait_for_terminate_and_warn("getent passwd", pid);
2433
2434         x = strchr(line, ':');
2435         if (!x) {
2436                 log_error("/etc/passwd entry has invalid user field.");
2437                 return -EIO;
2438         }
2439
2440         u = strchr(x+1, ':');
2441         if (!u) {
2442                 log_error("/etc/passwd entry has invalid password field.");
2443                 return -EIO;
2444         }
2445
2446         u++;
2447         g = strchr(u, ':');
2448         if (!g) {
2449                 log_error("/etc/passwd entry has invalid UID field.");
2450                 return -EIO;
2451         }
2452
2453         *g = 0;
2454         g++;
2455         x = strchr(g, ':');
2456         if (!x) {
2457                 log_error("/etc/passwd entry has invalid GID field.");
2458                 return -EIO;
2459         }
2460
2461         *x = 0;
2462         h = strchr(x+1, ':');
2463         if (!h) {
2464                 log_error("/etc/passwd entry has invalid GECOS field.");
2465                 return -EIO;
2466         }
2467
2468         h++;
2469         x = strchr(h, ':');
2470         if (!x) {
2471                 log_error("/etc/passwd entry has invalid home directory field.");
2472                 return -EIO;
2473         }
2474
2475         *x = 0;
2476
2477         r = parse_uid(u, &uid);
2478         if (r < 0) {
2479                 log_error("Failed to parse UID of user.");
2480                 return -EIO;
2481         }
2482
2483         r = parse_gid(g, &gid);
2484         if (r < 0) {
2485                 log_error("Failed to parse GID of user.");
2486                 return -EIO;
2487         }
2488
2489         home = strdup(h);
2490         if (!home)
2491                 return log_oom();
2492
2493         /* Second, get group memberships */
2494         fd = spawn_getent("initgroups", arg_user, &pid);
2495         if (fd < 0)
2496                 return fd;
2497
2498         fclose(f);
2499         f = fdopen(fd, "r");
2500         if (!f)
2501                 return log_oom();
2502         fd = -1;
2503
2504         if (!fgets(line, sizeof(line), f)) {
2505                 if (!ferror(f)) {
2506                         log_error("Failed to resolve user %s.", arg_user);
2507                         return -ESRCH;
2508                 }
2509
2510                 log_error("Failed to read from getent: %m");
2511                 return -errno;
2512         }
2513
2514         truncate_nl(line);
2515
2516         wait_for_terminate_and_warn("getent initgroups", pid);
2517
2518         /* Skip over the username and subsequent separator whitespace */
2519         x = line;
2520         x += strcspn(x, WHITESPACE);
2521         x += strspn(x, WHITESPACE);
2522
2523         FOREACH_WORD(w, l, x, state) {
2524                 char c[l+1];
2525
2526                 memcpy(c, w, l);
2527                 c[l] = 0;
2528
2529                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2530                         return log_oom();
2531
2532                 r = parse_uid(c, &uids[n_uids++]);
2533                 if (r < 0) {
2534                         log_error("Failed to parse group data from getent.");
2535                         return -EIO;
2536                 }
2537         }
2538
2539         r = mkdir_parents(home, 0775);
2540         if (r < 0) {
2541                 log_error("Failed to make home root directory: %s", strerror(-r));
2542                 return r;
2543         }
2544
2545         r = mkdir_safe(home, 0755, uid, gid);
2546         if (r < 0 && r != -EEXIST) {
2547                 log_error("Failed to make home directory: %s", strerror(-r));
2548                 return r;
2549         }
2550
2551         fchown(STDIN_FILENO, uid, gid);
2552         fchown(STDOUT_FILENO, uid, gid);
2553         fchown(STDERR_FILENO, uid, gid);
2554
2555         if (setgroups(n_uids, uids) < 0) {
2556                 log_error("Failed to set auxiliary groups: %m");
2557                 return -errno;
2558         }
2559
2560         if (setresgid(gid, gid, gid) < 0) {
2561                 log_error("setregid() failed: %m");
2562                 return -errno;
2563         }
2564
2565         if (setresuid(uid, uid, uid) < 0) {
2566                 log_error("setreuid() failed: %m");
2567                 return -errno;
2568         }
2569
2570         if (_home) {
2571                 *_home = home;
2572                 home = NULL;
2573         }
2574
2575         return 0;
2576 }
2577
2578 /*
2579  * Return 0 in case the container is being rebooted, has been shut
2580  * down or exited successfully. On failures a negative value is
2581  * returned.
2582  *
2583  * The status of the container "CONTAINER_TERMINATED" or
2584  * "CONTAINER_REBOOTED" will be saved in the container argument
2585  */
2586 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2587         int r;
2588         siginfo_t status;
2589
2590         r = wait_for_terminate(pid, &status);
2591         if (r < 0)
2592                 return r;
2593
2594         switch (status.si_code) {
2595         case CLD_EXITED:
2596                 r = status.si_status;
2597                 if (r == 0) {
2598                         if (!arg_quiet)
2599                                 log_debug("Container %s exited successfully.",
2600                                           arg_machine);
2601
2602                         *container = CONTAINER_TERMINATED;
2603                 } else {
2604                         log_error("Container %s failed with error code %i.",
2605                                   arg_machine, status.si_status);
2606                         r = -1;
2607                 }
2608                 break;
2609
2610         case CLD_KILLED:
2611                 if (status.si_status == SIGINT) {
2612                         if (!arg_quiet)
2613                                 log_info("Container %s has been shut down.",
2614                                          arg_machine);
2615
2616                         *container = CONTAINER_TERMINATED;
2617                         r = 0;
2618                         break;
2619                 } else if (status.si_status == SIGHUP) {
2620                         if (!arg_quiet)
2621                                 log_info("Container %s is being rebooted.",
2622                                          arg_machine);
2623
2624                         *container = CONTAINER_REBOOTED;
2625                         r = 0;
2626                         break;
2627                 }
2628                 /* CLD_KILLED fallthrough */
2629
2630         case CLD_DUMPED:
2631                 log_error("Container %s terminated by signal %s.",
2632                           arg_machine, signal_to_string(status.si_status));
2633                 r = -1;
2634                 break;
2635
2636         default:
2637                 log_error("Container %s failed due to unknown reason.",
2638                           arg_machine);
2639                 r = -1;
2640                 break;
2641         }
2642
2643         return r;
2644 }
2645
/* Signal handler that deliberately does nothing. main() installs it for
 * SIGCHLD so that the signal is delivered to a handler instead of taking
 * its default disposition. */
static void nop_handler(int sig) {
        (void) sig;
}
2647
2648 int main(int argc, char *argv[]) {
2649
2650         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2651         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2652         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2653         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2654         _cleanup_fdset_free_ FDSet *fds = NULL;
2655         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2656         const char *console = NULL;
2657         char veth_name[IFNAMSIZ];
2658         bool secondary = false;
2659         sigset_t mask, mask_chld;
2660         pid_t pid = 0;
2661
2662         log_parse_environment();
2663         log_open();
2664
2665         k = parse_argv(argc, argv);
2666         if (k < 0)
2667                 goto finish;
2668         else if (k == 0) {
2669                 r = EXIT_SUCCESS;
2670                 goto finish;
2671         }
2672
2673         if (!arg_image) {
2674                 if (arg_directory) {
2675                         char *p;
2676
2677                         p = path_make_absolute_cwd(arg_directory);
2678                         free(arg_directory);
2679                         arg_directory = p;
2680                 } else
2681                         arg_directory = get_current_dir_name();
2682
2683                 if (!arg_directory) {
2684                         log_error("Failed to determine path, please use -D.");
2685                         goto finish;
2686                 }
2687                 path_kill_slashes(arg_directory);
2688         }
2689
2690         if (!arg_machine) {
2691                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2692                 if (!arg_machine) {
2693                         log_oom();
2694                         goto finish;
2695                 }
2696
2697                 hostname_cleanup(arg_machine, false);
2698                 if (isempty(arg_machine)) {
2699                         log_error("Failed to determine machine name automatically, please use -M.");
2700                         goto finish;
2701                 }
2702         }
2703
2704         if (geteuid() != 0) {
2705                 log_error("Need to be root.");
2706                 goto finish;
2707         }
2708
2709         if (sd_booted() <= 0) {
2710                 log_error("Not running on a systemd system.");
2711                 goto finish;
2712         }
2713
2714         log_close();
2715         n_fd_passed = sd_listen_fds(false);
2716         if (n_fd_passed > 0) {
2717                 k = fdset_new_listen_fds(&fds, false);
2718                 if (k < 0) {
2719                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2720                         goto finish;
2721                 }
2722         }
2723         fdset_close_others(fds);
2724         log_open();
2725
2726         if (arg_directory) {
2727                 if (path_equal(arg_directory, "/")) {
2728                         log_error("Spawning container on root directory not supported.");
2729                         goto finish;
2730                 }
2731
2732                 if (arg_boot) {
2733                         if (path_is_os_tree(arg_directory) <= 0) {
2734                                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2735                                 goto finish;
2736                         }
2737                 } else {
2738                         const char *p;
2739
2740                         p = strappenda(arg_directory,
2741                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2742                         if (access(p, F_OK) < 0) {
2743                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2744                                 goto finish;
2745
2746                         }
2747                 }
2748         } else {
2749                 char template[] = "/tmp/nspawn-root-XXXXXX";
2750
2751                 if (!mkdtemp(template)) {
2752                         log_error("Failed to create temporary directory: %m");
2753                         r = -errno;
2754                         goto finish;
2755                 }
2756
2757                 arg_directory = strdup(template);
2758                 if (!arg_directory) {
2759                         r = log_oom();
2760                         goto finish;
2761                 }
2762
2763                 image_fd = setup_image(&device_path, &loop_nr);
2764                 if (image_fd < 0) {
2765                         r = image_fd;
2766                         goto finish;
2767                 }
2768
2769                 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2770                 if (r < 0)
2771                         goto finish;
2772         }
2773
2774         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2775         if (master < 0) {
2776                 log_error("Failed to acquire pseudo tty: %m");
2777                 goto finish;
2778         }
2779
2780         console = ptsname(master);
2781         if (!console) {
2782                 log_error("Failed to determine tty name: %m");
2783                 goto finish;
2784         }
2785
2786         if (!arg_quiet)
2787                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
2788
2789         if (unlockpt(master) < 0) {
2790                 log_error("Failed to unlock tty: %m");
2791                 goto finish;
2792         }
2793
2794         if (access("/dev/kdbus/control", F_OK) >= 0) {
2795
2796                 if (arg_share_system) {
2797                         kdbus_domain = strdup("/dev/kdbus");
2798                         if (!kdbus_domain) {
2799                                 log_oom();
2800                                 goto finish;
2801                         }
2802                 } else {
2803                         const char *ns;
2804
2805                         ns = strappenda("machine-", arg_machine);
2806                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2807                         if (r < 0)
2808                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2809                         else
2810                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2811                 }
2812         }
2813
2814         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2815                 log_error("Failed to create kmsg socket pair: %m");
2816                 goto finish;
2817         }
2818
2819         sd_notify(0, "READY=1");
2820
2821         assert_se(sigemptyset(&mask) == 0);
2822         assert_se(sigemptyset(&mask_chld) == 0);
2823         sigaddset(&mask_chld, SIGCHLD);
2824         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2825         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2826
2827         for (;;) {
2828                 ContainerStatus container_status;
2829                 int eventfds[2] = { -1, -1 };
2830                 struct sigaction sa = {
2831                         .sa_handler = nop_handler,
2832                         .sa_flags = SA_NOCLDSTOP,
2833                 };
2834
2835                 /* Child can be killed before execv(), so handle SIGCHLD
2836                  * in order to interrupt parent's blocking calls and
2837                  * give it a chance to call wait() and terminate. */
2838                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2839                 if (r < 0) {
2840                         log_error("Failed to change the signal mask: %m");
2841                         goto finish;
2842                 }
2843
2844                 r = sigaction(SIGCHLD, &sa, NULL);
2845                 if (r < 0) {
2846                         log_error("Failed to install SIGCHLD handler: %m");
2847                         goto finish;
2848                 }
2849
2850                 pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
2851                                          (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2852                                          (arg_private_network ? CLONE_NEWNET : 0), eventfds);
2853                 if (pid < 0) {
2854                         if (errno == EINVAL)
2855                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2856                         else
2857                                 log_error("clone() failed: %m");
2858
2859                         r = pid;
2860                         goto finish;
2861                 }
2862
2863                 if (pid == 0) {
2864                         /* child */
2865                         _cleanup_free_ char *home = NULL;
2866                         unsigned n_env = 2;
2867                         const char *envp[] = {
2868                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
2869                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2870                                 NULL, /* TERM */
2871                                 NULL, /* HOME */
2872                                 NULL, /* USER */
2873                                 NULL, /* LOGNAME */
2874                                 NULL, /* container_uuid */
2875                                 NULL, /* LISTEN_FDS */
2876                                 NULL, /* LISTEN_PID */
2877                                 NULL
2878                         };
2879                         char **env_use;
2880
2881                         envp[n_env] = strv_find_prefix(environ, "TERM=");
2882                         if (envp[n_env])
2883                                 n_env ++;
2884
2885                         master = safe_close(master);
2886
2887                         close_nointr(STDIN_FILENO);
2888                         close_nointr(STDOUT_FILENO);
2889                         close_nointr(STDERR_FILENO);
2890
2891                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2892
2893                         reset_all_signal_handlers();
2894
2895                         assert_se(sigemptyset(&mask) == 0);
2896                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2897
2898                         k = open_terminal(console, O_RDWR);
2899                         if (k != STDIN_FILENO) {
2900                                 if (k >= 0) {
2901                                         safe_close(k);
2902                                         k = -EINVAL;
2903                                 }
2904
2905                                 log_error("Failed to open console: %s", strerror(-k));
2906                                 goto child_fail;
2907                         }
2908
2909                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2910                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2911                                 log_error("Failed to duplicate console: %m");
2912                                 goto child_fail;
2913                         }
2914
2915                         if (setsid() < 0) {
2916                                 log_error("setsid() failed: %m");
2917                                 goto child_fail;
2918                         }
2919
2920                         if (reset_audit_loginuid() < 0)
2921                                 goto child_fail;
2922
2923                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2924                                 log_error("PR_SET_PDEATHSIG failed: %m");
2925                                 goto child_fail;
2926                         }
2927
2928                         /* Mark everything as slave, so that we still
2929                          * receive mounts from the real root, but don't
2930                          * propagate mounts to the real root. */
2931                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2932                                 log_error("MS_SLAVE|MS_REC failed: %m");
2933                                 goto child_fail;
2934                         }
2935
2936                         if (mount_devices(arg_directory,
2937                                           root_device, root_device_rw,
2938                                           home_device, home_device_rw,
2939                                           srv_device, srv_device_rw) < 0)
2940                                 goto child_fail;
2941
2942                         /* Turn directory into bind mount */
2943                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2944                                 log_error("Failed to make bind mount.");
2945                                 goto child_fail;
2946                         }
2947
2948                         if (arg_read_only)
2949                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2950                                         log_error("Failed to make read-only.");
2951                                         goto child_fail;
2952                                 }
2953
2954                         if (mount_all(arg_directory) < 0)
2955                                 goto child_fail;
2956
2957                         if (copy_devnodes(arg_directory) < 0)
2958                                 goto child_fail;
2959
2960                         if (setup_ptmx(arg_directory) < 0)
2961                                 goto child_fail;
2962
2963                         dev_setup(arg_directory);
2964
2965                         if (audit_still_doesnt_work_in_containers() < 0)
2966                                 goto child_fail;
2967
2968                         if (setup_dev_console(arg_directory, console) < 0)
2969                                 goto child_fail;
2970
2971                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2972                                 goto child_fail;
2973
2974                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
2975
2976                         if (setup_boot_id(arg_directory) < 0)
2977                                 goto child_fail;
2978
2979                         if (setup_timezone(arg_directory) < 0)
2980                                 goto child_fail;
2981
2982                         if (setup_resolv_conf(arg_directory) < 0)
2983                                 goto child_fail;
2984
2985                         if (setup_journal(arg_directory) < 0)
2986                                 goto child_fail;
2987
2988                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
2989                                 goto child_fail;
2990
2991                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2992                                 goto child_fail;
2993
2994                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
2995                                 goto child_fail;
2996
2997                         /* Tell the parent that we are ready, and that
2998                          * it can cgroupify us to that we lack access
2999                          * to certain devices and resources. */
3000                         r = eventfd_send_state(eventfds[1],
3001                                                EVENTFD_CHILD_SUCCEEDED);
3002                         eventfds[1] = safe_close(eventfds[1]);
3003                         if (r < 0)
3004                                 goto child_fail;
3005
3006                         if (chdir(arg_directory) < 0) {
3007                                 log_error("chdir(%s) failed: %m", arg_directory);
3008                                 goto child_fail;
3009                         }
3010
3011                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3012                                 log_error("mount(MS_MOVE) failed: %m");
3013                                 goto child_fail;
3014                         }
3015
3016                         if (chroot(".") < 0) {
3017                                 log_error("chroot() failed: %m");
3018                                 goto child_fail;
3019                         }
3020
3021                         if (chdir("/") < 0) {
3022                                 log_error("chdir() failed: %m");
3023                                 goto child_fail;
3024                         }
3025
3026                         umask(0022);
3027
3028                         if (arg_private_network)
3029                                 loopback_setup();
3030
3031                         if (drop_capabilities() < 0) {
3032                                 log_error("drop_capabilities() failed: %m");
3033                                 goto child_fail;
3034                         }
3035
3036                         r = change_uid_gid(&home);
3037                         if (r < 0)
3038                                 goto child_fail;
3039
3040                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3041                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3042                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3043                                 log_oom();
3044                                 goto child_fail;
3045                         }
3046
3047                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3048                                 char as_uuid[37];
3049
3050                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3051                                         log_oom();
3052                                         goto child_fail;
3053                                 }
3054                         }
3055
3056                         if (fdset_size(fds) > 0) {
3057                                 k = fdset_cloexec(fds, false);
3058                                 if (k < 0) {
3059                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3060                                         goto child_fail;
3061                                 }
3062
3063                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3064                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3065                                         log_oom();
3066                                         goto child_fail;
3067                                 }
3068                         }
3069
3070                         setup_hostname();
3071
3072                         if (arg_personality != 0xffffffffLU) {
3073                                 if (personality(arg_personality) < 0) {
3074                                         log_error("personality() failed: %m");
3075                                         goto child_fail;
3076                                 }
3077                         } else if (secondary) {
3078                                 if (personality(PER_LINUX32) < 0) {
3079                                         log_error("personality() failed: %m");
3080                                         goto child_fail;
3081                                 }
3082                         }
3083
3084 #ifdef HAVE_SELINUX
3085                         if (arg_selinux_context)
3086                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3087                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3088                                         goto child_fail;
3089                                 }
3090 #endif
3091
3092                         if (!strv_isempty(arg_setenv)) {
3093                                 char **n;
3094
3095                                 n = strv_env_merge(2, envp, arg_setenv);
3096                                 if (!n) {
3097                                         log_oom();
3098                                         goto child_fail;
3099                                 }
3100
3101                                 env_use = n;
3102                         } else
3103                                 env_use = (char**) envp;
3104
3105                         /* Wait until the parent is ready with the setup, too... */
3106                         r = eventfd_parent_succeeded(eventfds[0]);
3107                         eventfds[0] = safe_close(eventfds[0]);
3108                         if (r < 0)
3109                                 goto child_fail;
3110
3111                         if (arg_boot) {
3112                                 char **a;
3113                                 size_t l;
3114
3115                                 /* Automatically search for the init system */
3116
3117                                 l = 1 + argc - optind;
3118                                 a = newa(char*, l + 1);
3119                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3120
3121                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3122                                 execve(a[0], a, env_use);
3123
3124                                 a[0] = (char*) "/lib/systemd/systemd";
3125                                 execve(a[0], a, env_use);
3126
3127                                 a[0] = (char*) "/sbin/init";
3128                                 execve(a[0], a, env_use);
3129                         } else if (argc > optind)
3130                                 execvpe(argv[optind], argv + optind, env_use);
3131                         else {
3132                                 chdir(home ? home : "/root");
3133                                 execle("/bin/bash", "-bash", NULL, env_use);
3134                                 execle("/bin/sh", "-sh", NULL, env_use);
3135                         }
3136
3137                         log_error("execv() failed: %m");
3138
3139                 child_fail:
3140                         /* Tell the parent that the setup failed, so he
3141                          * can clean up resources and terminate. */
3142                         if (eventfds[1] != -1)
3143                                 eventfd_send_state(eventfds[1],
3144                                                    EVENTFD_CHILD_FAILED);
3145                         _exit(EXIT_FAILURE);
3146                 }
3147
3148                 fdset_free(fds);
3149                 fds = NULL;
3150
3151                 /* Wait for the child event:
3152                  * If EVENTFD_CHILD_FAILED, the child will terminate soon.
3153                  * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
3154                  * it is ready with all it needs to do with priviliges.
3155                  * After we got the notification we can make the process
3156                  * join its cgroup which might limit what it can do */
3157                 r = eventfd_child_succeeded(eventfds[1]);
3158                 eventfds[1] = safe_close(eventfds[1]);
3159                 if (r < 0)
3160                         goto check_container_status;
3161
3162                 r = register_machine(pid);
3163                 if (r < 0)
3164                         goto finish;
3165
3166                 r = move_network_interfaces(pid);
3167                 if (r < 0)
3168                         goto finish;
3169
3170                 r = setup_veth(pid, veth_name);
3171                 if (r < 0)
3172                         goto finish;
3173
3174                 r = setup_bridge(veth_name);
3175                 if (r < 0)
3176                         goto finish;
3177
3178                 r = setup_macvlan(pid);
3179                 if (r < 0)
3180                         goto finish;
3181
3182                 /* Block SIGCHLD here, before notifying child.
3183                  * process_pty() will handle it with the other signals. */
3184                 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3185                 if (r < 0)
3186                         goto finish;
3187
3188                 /* Reset signal to default */
3189                 r = default_signals(SIGCHLD, -1);
3190                 if (r < 0)
3191                         goto finish;
3192
3193                 /* Notify the child that the parent is ready with all
3194                  * its setup, and that the child can now hand over
3195                  * control to the code to run inside the container. */
3196                 r = eventfd_send_state(eventfds[0],
3197                                        EVENTFD_PARENT_SUCCEEDED);
3198                 eventfds[0] = safe_close(eventfds[0]);
3199                 if (r < 0)
3200                         goto finish;
3201
3202                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3203                 if (k < 0) {
3204                         r = EXIT_FAILURE;
3205                         break;
3206                 }
3207
3208                 if (!arg_quiet)
3209                         putc('\n', stdout);
3210
3211                 /* Kill if it is not dead yet anyway */
3212                 terminate_machine(pid);
3213
3214 check_container_status:
3215                 /* Redundant, but better safe than sorry */
3216                 kill(pid, SIGKILL);
3217
3218                 r = wait_for_container(pid, &container_status);
3219                 pid = 0;
3220
3221                 if (r < 0) {
3222                         r = EXIT_FAILURE;
3223                         break;
3224                 } else if (container_status == CONTAINER_TERMINATED)
3225                         break;
3226
3227                 /* CONTAINER_REBOOTED, loop again */
3228         }
3229
3230 finish:
3231         loop_remove(loop_nr, &image_fd);
3232
3233         if (pid > 0)
3234                 kill(pid, SIGKILL);
3235
3236         free(arg_directory);
3237         free(arg_machine);
3238         free(arg_user);
3239         strv_free(arg_setenv);
3240         strv_free(arg_network_interfaces);
3241         strv_free(arg_network_macvlan);
3242         strv_free(arg_bind);
3243         strv_free(arg_bind_ro);
3244
3245         return r;
3246 }