chiark / gitweb /
nspawn: UP the host side of the veth pair after adding it to a bridge
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90
91 #ifdef HAVE_SECCOMP
92 #include "seccomp-util.h"
93 #endif
94
/* Journal linking mode for the container, selected on the command line
 * with --link-journal=MODE (or -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* --link-journal=no */
        LINK_AUTO  = 1,   /* --link-journal=auto; also the built-in default */
        LINK_HOST  = 2,   /* --link-journal=host */
        LINK_GUEST = 3    /* --link-journal=guest, and what -j selects */
} LinkJournal;
101
102 static char *arg_directory = NULL;
103 static char *arg_user = NULL;
104 static sd_id128_t arg_uuid = {};
105 static char *arg_machine = NULL;
106 static const char *arg_selinux_context = NULL;
107 static const char *arg_selinux_apifs_context = NULL;
108 static const char *arg_slice = NULL;
109 static bool arg_private_network = false;
110 static bool arg_read_only = false;
111 static bool arg_boot = false;
112 static LinkJournal arg_link_journal = LINK_AUTO;
113 static uint64_t arg_retain =
114         (1ULL << CAP_CHOWN) |
115         (1ULL << CAP_DAC_OVERRIDE) |
116         (1ULL << CAP_DAC_READ_SEARCH) |
117         (1ULL << CAP_FOWNER) |
118         (1ULL << CAP_FSETID) |
119         (1ULL << CAP_IPC_OWNER) |
120         (1ULL << CAP_KILL) |
121         (1ULL << CAP_LEASE) |
122         (1ULL << CAP_LINUX_IMMUTABLE) |
123         (1ULL << CAP_NET_BIND_SERVICE) |
124         (1ULL << CAP_NET_BROADCAST) |
125         (1ULL << CAP_NET_RAW) |
126         (1ULL << CAP_SETGID) |
127         (1ULL << CAP_SETFCAP) |
128         (1ULL << CAP_SETPCAP) |
129         (1ULL << CAP_SETUID) |
130         (1ULL << CAP_SYS_ADMIN) |
131         (1ULL << CAP_SYS_CHROOT) |
132         (1ULL << CAP_SYS_NICE) |
133         (1ULL << CAP_SYS_PTRACE) |
134         (1ULL << CAP_SYS_TTY_CONFIG) |
135         (1ULL << CAP_SYS_RESOURCE) |
136         (1ULL << CAP_SYS_BOOT) |
137         (1ULL << CAP_AUDIT_WRITE) |
138         (1ULL << CAP_AUDIT_CONTROL) |
139         (1ULL << CAP_MKNOD);
140 static char **arg_bind = NULL;
141 static char **arg_bind_ro = NULL;
142 static char **arg_setenv = NULL;
143 static bool arg_quiet = false;
144 static bool arg_share_system = false;
145 static bool arg_register = true;
146 static bool arg_keep_unit = false;
147 static char **arg_network_interfaces = NULL;
148 static char **arg_network_macvlan = NULL;
149 static bool arg_network_veth = false;
150 static const char *arg_network_bridge = NULL;
151 static unsigned long arg_personality = 0xffffffffLU;
152 static const char *arg_image = NULL;
153
154 static int help(void) {
155
156         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
157                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
158                "  -h --help                 Show this help\n"
159                "     --version              Print version string\n"
160                "  -q --quiet                Do not show status information\n"
161                "  -D --directory=PATH       Root directory for the container\n"
162                "  -i --image=PATH           File system device or image for the container\n"
163                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
164                "  -u --user=USER            Run the command under specified user or uid\n"
165                "  -M --machine=NAME         Set the machine name for the container\n"
166                "     --uuid=UUID            Set a specific machine UUID for the container\n"
167                "  -S --slice=SLICE          Place the container in the specified slice\n"
168                "     --private-network      Disable network in container\n"
169                "     --network-interface=INTERFACE\n"
170                "                            Assign an existing network interface to the\n"
171                "                            container\n"
172                "     --network-macvlan=INTERFACE\n"
173                "                            Create a macvlan network interface based on an\n"
174                "                            existing network interface to the container\n"
175                "     --network-veth         Add a virtual ethernet connection between host\n"
176                "                            and container\n"
177                "     --network-bridge=INTERFACE\n"
178                "                            Add a virtual ethernet connection between host\n"
179                "                            and container and add it to an existing bridge on\n"
180                "                            the host\n"
181                "  -Z --selinux-context=SECLABEL\n"
182                "                            Set the SELinux security context to be used by\n"
183                "                            processes in the container\n"
184                "  -L --selinux-apifs-context=SECLABEL\n"
185                "                            Set the SELinux security context to be used by\n"
186                "                            API/tmpfs file systems in the container\n"
187                "     --capability=CAP       In addition to the default, retain specified\n"
188                "                            capability\n"
189                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
190                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
191                "  -j                        Equivalent to --link-journal=host\n"
192                "     --read-only            Mount the root directory read-only\n"
193                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
194                "                            the container\n"
195                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
196                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
197                "     --share-system         Share system namespaces with host\n"
198                "     --register=BOOLEAN     Register container as machine\n"
199                "     --keep-unit            Do not register a scope for the machine, reuse\n"
200                "                            the service unit nspawn is running in\n",
201                program_invocation_short_name);
202
203         return 0;
204 }
205
206 static int parse_argv(int argc, char *argv[]) {
207
208         enum {
209                 ARG_VERSION = 0x100,
210                 ARG_PRIVATE_NETWORK,
211                 ARG_UUID,
212                 ARG_READ_ONLY,
213                 ARG_CAPABILITY,
214                 ARG_DROP_CAPABILITY,
215                 ARG_LINK_JOURNAL,
216                 ARG_BIND,
217                 ARG_BIND_RO,
218                 ARG_SETENV,
219                 ARG_SHARE_SYSTEM,
220                 ARG_REGISTER,
221                 ARG_KEEP_UNIT,
222                 ARG_NETWORK_INTERFACE,
223                 ARG_NETWORK_MACVLAN,
224                 ARG_NETWORK_VETH,
225                 ARG_NETWORK_BRIDGE,
226                 ARG_PERSONALITY,
227         };
228
229         static const struct option options[] = {
230                 { "help",                  no_argument,       NULL, 'h'                   },
231                 { "version",               no_argument,       NULL, ARG_VERSION           },
232                 { "directory",             required_argument, NULL, 'D'                   },
233                 { "user",                  required_argument, NULL, 'u'                   },
234                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
235                 { "boot",                  no_argument,       NULL, 'b'                   },
236                 { "uuid",                  required_argument, NULL, ARG_UUID              },
237                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
238                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
239                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
240                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
241                 { "bind",                  required_argument, NULL, ARG_BIND              },
242                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
243                 { "machine",               required_argument, NULL, 'M'                   },
244                 { "slice",                 required_argument, NULL, 'S'                   },
245                 { "setenv",                required_argument, NULL, ARG_SETENV            },
246                 { "selinux-context",       required_argument, NULL, 'Z'                   },
247                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
248                 { "quiet",                 no_argument,       NULL, 'q'                   },
249                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
250                 { "register",              required_argument, NULL, ARG_REGISTER          },
251                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
252                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
253                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
254                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
255                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
256                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
257                 { "image",                 required_argument, NULL, 'i'                   },
258                 {}
259         };
260
261         int c, r;
262         uint64_t plus = 0, minus = 0;
263
264         assert(argc >= 0);
265         assert(argv);
266
267         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
268
269                 switch (c) {
270
271                 case 'h':
272                         return help();
273
274                 case ARG_VERSION:
275                         puts(PACKAGE_STRING);
276                         puts(SYSTEMD_FEATURES);
277                         return 0;
278
279                 case 'D':
280                         free(arg_directory);
281                         arg_directory = canonicalize_file_name(optarg);
282                         if (!arg_directory) {
283                                 log_error("Invalid root directory: %m");
284                                 return -ENOMEM;
285                         }
286
287                         break;
288
289                 case 'i':
290                         arg_image = optarg;
291                         break;
292
293                 case 'u':
294                         free(arg_user);
295                         arg_user = strdup(optarg);
296                         if (!arg_user)
297                                 return log_oom();
298
299                         break;
300
301                 case ARG_NETWORK_BRIDGE:
302                         arg_network_bridge = optarg;
303
304                         /* fall through */
305
306                 case ARG_NETWORK_VETH:
307                         arg_network_veth = true;
308                         arg_private_network = true;
309                         break;
310
311                 case ARG_NETWORK_INTERFACE:
312                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
313                                 return log_oom();
314
315                         arg_private_network = true;
316                         break;
317
318                 case ARG_NETWORK_MACVLAN:
319                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
320                                 return log_oom();
321
322                         /* fall through */
323
324                 case ARG_PRIVATE_NETWORK:
325                         arg_private_network = true;
326                         break;
327
328                 case 'b':
329                         arg_boot = true;
330                         break;
331
332                 case ARG_UUID:
333                         r = sd_id128_from_string(optarg, &arg_uuid);
334                         if (r < 0) {
335                                 log_error("Invalid UUID: %s", optarg);
336                                 return r;
337                         }
338                         break;
339
340                 case 'S':
341                         arg_slice = optarg;
342                         break;
343
344                 case 'M':
345                         if (isempty(optarg)) {
346                                 free(arg_machine);
347                                 arg_machine = NULL;
348                         } else {
349
350                                 if (!hostname_is_valid(optarg)) {
351                                         log_error("Invalid machine name: %s", optarg);
352                                         return -EINVAL;
353                                 }
354
355                                 free(arg_machine);
356                                 arg_machine = strdup(optarg);
357                                 if (!arg_machine)
358                                         return log_oom();
359
360                                 break;
361                         }
362
363                 case 'Z':
364                         arg_selinux_context = optarg;
365                         break;
366
367                 case 'L':
368                         arg_selinux_apifs_context = optarg;
369                         break;
370
371                 case ARG_READ_ONLY:
372                         arg_read_only = true;
373                         break;
374
375                 case ARG_CAPABILITY:
376                 case ARG_DROP_CAPABILITY: {
377                         char *state, *word;
378                         size_t length;
379
380                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
381                                 _cleanup_free_ char *t;
382                                 cap_value_t cap;
383
384                                 t = strndup(word, length);
385                                 if (!t)
386                                         return log_oom();
387
388                                 if (streq(t, "all")) {
389                                         if (c == ARG_CAPABILITY)
390                                                 plus = (uint64_t) -1;
391                                         else
392                                                 minus = (uint64_t) -1;
393                                 } else {
394                                         if (cap_from_name(t, &cap) < 0) {
395                                                 log_error("Failed to parse capability %s.", t);
396                                                 return -EINVAL;
397                                         }
398
399                                         if (c == ARG_CAPABILITY)
400                                                 plus |= 1ULL << (uint64_t) cap;
401                                         else
402                                                 minus |= 1ULL << (uint64_t) cap;
403                                 }
404                         }
405
406                         break;
407                 }
408
409                 case 'j':
410                         arg_link_journal = LINK_GUEST;
411                         break;
412
413                 case ARG_LINK_JOURNAL:
414                         if (streq(optarg, "auto"))
415                                 arg_link_journal = LINK_AUTO;
416                         else if (streq(optarg, "no"))
417                                 arg_link_journal = LINK_NO;
418                         else if (streq(optarg, "guest"))
419                                 arg_link_journal = LINK_GUEST;
420                         else if (streq(optarg, "host"))
421                                 arg_link_journal = LINK_HOST;
422                         else {
423                                 log_error("Failed to parse link journal mode %s", optarg);
424                                 return -EINVAL;
425                         }
426
427                         break;
428
429                 case ARG_BIND:
430                 case ARG_BIND_RO: {
431                         _cleanup_free_ char *a = NULL, *b = NULL;
432                         char *e;
433                         char ***x;
434
435                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
436
437                         e = strchr(optarg, ':');
438                         if (e) {
439                                 a = strndup(optarg, e - optarg);
440                                 b = strdup(e + 1);
441                         } else {
442                                 a = strdup(optarg);
443                                 b = strdup(optarg);
444                         }
445
446                         if (!a || !b)
447                                 return log_oom();
448
449                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
450                                 log_error("Invalid bind mount specification: %s", optarg);
451                                 return -EINVAL;
452                         }
453
454                         r = strv_extend(x, a);
455                         if (r < 0)
456                                 return log_oom();
457
458                         r = strv_extend(x, b);
459                         if (r < 0)
460                                 return log_oom();
461
462                         break;
463                 }
464
465                 case ARG_SETENV: {
466                         char **n;
467
468                         if (!env_assignment_is_valid(optarg)) {
469                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
470                                 return -EINVAL;
471                         }
472
473                         n = strv_env_set(arg_setenv, optarg);
474                         if (!n)
475                                 return log_oom();
476
477                         strv_free(arg_setenv);
478                         arg_setenv = n;
479                         break;
480                 }
481
482                 case 'q':
483                         arg_quiet = true;
484                         break;
485
486                 case ARG_SHARE_SYSTEM:
487                         arg_share_system = true;
488                         break;
489
490                 case ARG_REGISTER:
491                         r = parse_boolean(optarg);
492                         if (r < 0) {
493                                 log_error("Failed to parse --register= argument: %s", optarg);
494                                 return r;
495                         }
496
497                         arg_register = r;
498                         break;
499
500                 case ARG_KEEP_UNIT:
501                         arg_keep_unit = true;
502                         break;
503
504                 case ARG_PERSONALITY:
505
506                         arg_personality = personality_from_string(optarg);
507                         if (arg_personality == 0xffffffffLU) {
508                                 log_error("Unknown or unsupported personality '%s'.", optarg);
509                                 return -EINVAL;
510                         }
511
512                         break;
513
514                 case '?':
515                         return -EINVAL;
516
517                 default:
518                         assert_not_reached("Unhandled option");
519                 }
520         }
521
522         if (arg_share_system)
523                 arg_register = false;
524
525         if (arg_boot && arg_share_system) {
526                 log_error("--boot and --share-system may not be combined.");
527                 return -EINVAL;
528         }
529
530         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
531                 log_error("--keep-unit may not be used when invoked from a user session.");
532                 return -EINVAL;
533         }
534
535         if (arg_directory && arg_image) {
536                 log_error("--directory= and --image= may not be combined.");
537                 return -EINVAL;
538         }
539
540         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
541
542         return 1;
543 }
544
545 static int mount_all(const char *dest) {
546
547         typedef struct MountPoint {
548                 const char *what;
549                 const char *where;
550                 const char *type;
551                 const char *options;
552                 unsigned long flags;
553                 bool fatal;
554         } MountPoint;
555
556         static const MountPoint mount_table[] = {
557                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
558                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
559                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
560                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
561                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
562                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
563                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
564                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
565 #ifdef HAVE_SELINUX
566                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
567                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
568 #endif
569         };
570
571         unsigned k;
572         int r = 0;
573
574         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
575                 _cleanup_free_ char *where = NULL;
576 #ifdef HAVE_SELINUX
577                 _cleanup_free_ char *options = NULL;
578 #endif
579                 const char *o;
580                 int t;
581
582                 where = strjoin(dest, "/", mount_table[k].where, NULL);
583                 if (!where)
584                         return log_oom();
585
586                 t = path_is_mount_point(where, true);
587                 if (t < 0) {
588                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
589
590                         if (r == 0)
591                                 r = t;
592
593                         continue;
594                 }
595
596                 /* Skip this entry if it is not a remount. */
597                 if (mount_table[k].what && t > 0)
598                         continue;
599
600                 mkdir_p(where, 0755);
601
602 #ifdef HAVE_SELINUX
603                 if (arg_selinux_apifs_context &&
604                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
605                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
606                         if (!options)
607                                 return log_oom();
608
609                         o = options;
610                 } else
611 #endif
612                         o = mount_table[k].options;
613
614
615                 if (mount(mount_table[k].what,
616                           where,
617                           mount_table[k].type,
618                           mount_table[k].flags,
619                           o) < 0 &&
620                     mount_table[k].fatal) {
621
622                         log_error("mount(%s) failed: %m", where);
623
624                         if (r == 0)
625                                 r = -errno;
626                 }
627         }
628
629         return r;
630 }
631
632 static int mount_binds(const char *dest, char **l, unsigned long flags) {
633         char **x, **y;
634
635         STRV_FOREACH_PAIR(x, y, l) {
636                 char *where;
637                 struct stat source_st, dest_st;
638                 int r;
639
640                 if (stat(*x, &source_st) < 0) {
641                         log_error("Failed to stat %s: %m", *x);
642                         return -errno;
643                 }
644
645                 where = strappenda(dest, *y);
646                 r = stat(where, &dest_st);
647                 if (r == 0) {
648                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
649                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
650                                                 *x, where);
651                                 return -EINVAL;
652                         }
653                 } else if (errno == ENOENT) {
654                         r = mkdir_parents_label(where, 0755);
655                         if (r < 0) {
656                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
657                                 return r;
658                         }
659                 } else {
660                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
661                         return -errno;
662                 }
663                 /* Create the mount point, but be conservative -- refuse to create block
664                 * and char devices. */
665                 if (S_ISDIR(source_st.st_mode))
666                         mkdir_label(where, 0755);
667                 else if (S_ISFIFO(source_st.st_mode))
668                         mkfifo(where, 0644);
669                 else if (S_ISSOCK(source_st.st_mode))
670                         mknod(where, 0644 | S_IFSOCK, 0);
671                 else if (S_ISREG(source_st.st_mode))
672                         touch(where);
673                 else {
674                         log_error("Refusing to create mountpoint for file: %s", *x);
675                         return -ENOTSUP;
676                 }
677
678                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
679                         log_error("mount(%s) failed: %m", where);
680                         return -errno;
681                 }
682
683                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
684                         log_error("mount(%s) failed: %m", where);
685                         return -errno;
686                 }
687         }
688
689         return 0;
690 }
691
692 static int setup_timezone(const char *dest) {
693         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
694         char *z, *y;
695         int r;
696
697         assert(dest);
698
699         /* Fix the timezone, if possible */
700         r = readlink_malloc("/etc/localtime", &p);
701         if (r < 0) {
702                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
703                 return 0;
704         }
705
706         z = path_startswith(p, "../usr/share/zoneinfo/");
707         if (!z)
708                 z = path_startswith(p, "/usr/share/zoneinfo/");
709         if (!z) {
710                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
711                 return 0;
712         }
713
714         where = strappend(dest, "/etc/localtime");
715         if (!where)
716                 return log_oom();
717
718         r = readlink_malloc(where, &q);
719         if (r >= 0) {
720                 y = path_startswith(q, "../usr/share/zoneinfo/");
721                 if (!y)
722                         y = path_startswith(q, "/usr/share/zoneinfo/");
723
724
725                 /* Already pointing to the right place? Then do nothing .. */
726                 if (y && streq(y, z))
727                         return 0;
728         }
729
730         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
731         if (!check)
732                 return log_oom();
733
734         if (access(check, F_OK) < 0) {
735                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
736                 return 0;
737         }
738
739         what = strappend("../usr/share/zoneinfo/", z);
740         if (!what)
741                 return log_oom();
742
743         unlink(where);
744         if (symlink(what, where) < 0) {
745                 log_error("Failed to correct timezone of container: %m");
746                 return 0;
747         }
748
749         return 0;
750 }
751
752 static int setup_resolv_conf(const char *dest) {
753         char _cleanup_free_ *where = NULL;
754
755         assert(dest);
756
757         if (arg_private_network)
758                 return 0;
759
760         /* Fix resolv.conf, if possible */
761         where = strappend(dest, "/etc/resolv.conf");
762         if (!where)
763                 return log_oom();
764
765         /* We don't really care for the results of this really. If it
766          * fails, it fails, but meh... */
767         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
768
769         return 0;
770 }
771
772 static int setup_boot_id(const char *dest) {
773         _cleanup_free_ char *from = NULL, *to = NULL;
774         sd_id128_t rnd = {};
775         char as_uuid[37];
776         int r;
777
778         assert(dest);
779
780         if (arg_share_system)
781                 return 0;
782
783         /* Generate a new randomized boot ID, so that each boot-up of
784          * the container gets a new one */
785
786         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
787         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
788         if (!from || !to)
789                 return log_oom();
790
791         r = sd_id128_randomize(&rnd);
792         if (r < 0) {
793                 log_error("Failed to generate random boot id: %s", strerror(-r));
794                 return r;
795         }
796
797         snprintf(as_uuid, sizeof(as_uuid),
798                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
799                  SD_ID128_FORMAT_VAL(rnd));
800         char_array_0(as_uuid);
801
802         r = write_string_file(from, as_uuid);
803         if (r < 0) {
804                 log_error("Failed to write boot id: %s", strerror(-r));
805                 return r;
806         }
807
808         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
809                 log_error("Failed to bind mount boot id: %m");
810                 r = -errno;
811         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
812                 log_warning("Failed to make boot id read-only: %m");
813
814         unlink(from);
815         return r;
816 }
817
818 static int copy_devnodes(const char *dest) {
819
820         static const char devnodes[] =
821                 "null\0"
822                 "zero\0"
823                 "full\0"
824                 "random\0"
825                 "urandom\0"
826                 "tty\0";
827
828         const char *d;
829         int r = 0;
830         _cleanup_umask_ mode_t u;
831
832         assert(dest);
833
834         u = umask(0000);
835
836         NULSTR_FOREACH(d, devnodes) {
837                 _cleanup_free_ char *from = NULL, *to = NULL;
838                 struct stat st;
839
840                 from = strappend("/dev/", d);
841                 to = strjoin(dest, "/dev/", d, NULL);
842                 if (!from || !to)
843                         return log_oom();
844
845                 if (stat(from, &st) < 0) {
846
847                         if (errno != ENOENT) {
848                                 log_error("Failed to stat %s: %m", from);
849                                 return -errno;
850                         }
851
852                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
853
854                         log_error("%s is not a char or block device, cannot copy", from);
855                         return -EIO;
856
857                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
858
859                         log_error("mknod(%s) failed: %m", dest);
860                         return  -errno;
861                 }
862         }
863
864         return r;
865 }
866
867 static int setup_ptmx(const char *dest) {
868         _cleanup_free_ char *p = NULL;
869
870         p = strappend(dest, "/dev/ptmx");
871         if (!p)
872                 return log_oom();
873
874         if (symlink("pts/ptmx", p) < 0) {
875                 log_error("Failed to create /dev/ptmx symlink: %m");
876                 return -errno;
877         }
878
879         return 0;
880 }
881
882 static int setup_dev_console(const char *dest, const char *console) {
883         _cleanup_umask_ mode_t u;
884         const char *to;
885         struct stat st;
886         int r;
887
888         assert(dest);
889         assert(console);
890
891         u = umask(0000);
892
893         if (stat("/dev/null", &st) < 0) {
894                 log_error("Failed to stat /dev/null: %m");
895                 return -errno;
896         }
897
898         r = chmod_and_chown(console, 0600, 0, 0);
899         if (r < 0) {
900                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
901                 return r;
902         }
903
904         /* We need to bind mount the right tty to /dev/console since
905          * ptys can only exist on pts file systems. To have something
906          * to bind mount things on we create a device node first, and
907          * use /dev/null for that since we the cgroups device policy
908          * allows us to create that freely, while we cannot create
909          * /dev/console. (Note that the major minor doesn't actually
910          * matter here, since we mount it over anyway). */
911
912         to = strappenda(dest, "/dev/console");
913         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
914                 log_error("mknod() for /dev/console failed: %m");
915                 return -errno;
916         }
917
918         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
919                 log_error("Bind mount for /dev/console failed: %m");
920                 return -errno;
921         }
922
923         return 0;
924 }
925
926 static int setup_kmsg(const char *dest, int kmsg_socket) {
927         _cleanup_free_ char *from = NULL, *to = NULL;
928         int r, fd, k;
929         _cleanup_umask_ mode_t u;
930         union {
931                 struct cmsghdr cmsghdr;
932                 uint8_t buf[CMSG_SPACE(sizeof(int))];
933         } control = {};
934         struct msghdr mh = {
935                 .msg_control = &control,
936                 .msg_controllen = sizeof(control),
937         };
938         struct cmsghdr *cmsg;
939
940         assert(dest);
941         assert(kmsg_socket >= 0);
942
943         u = umask(0000);
944
945         /* We create the kmsg FIFO as /dev/kmsg, but immediately
946          * delete it after bind mounting it to /proc/kmsg. While FIFOs
947          * on the reading side behave very similar to /proc/kmsg,
948          * their writing side behaves differently from /dev/kmsg in
949          * that writing blocks when nothing is reading. In order to
950          * avoid any problems with containers deadlocking due to this
951          * we simply make /dev/kmsg unavailable to the container. */
952         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
953             asprintf(&to, "%s/proc/kmsg", dest) < 0)
954                 return log_oom();
955
956         if (mkfifo(from, 0600) < 0) {
957                 log_error("mkfifo() for /dev/kmsg failed: %m");
958                 return -errno;
959         }
960
961         r = chmod_and_chown(from, 0600, 0, 0);
962         if (r < 0) {
963                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
964                 return r;
965         }
966
967         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
968                 log_error("Bind mount for /proc/kmsg failed: %m");
969                 return -errno;
970         }
971
972         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
973         if (fd < 0) {
974                 log_error("Failed to open fifo: %m");
975                 return -errno;
976         }
977
978         cmsg = CMSG_FIRSTHDR(&mh);
979         cmsg->cmsg_level = SOL_SOCKET;
980         cmsg->cmsg_type = SCM_RIGHTS;
981         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
982         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
983
984         mh.msg_controllen = cmsg->cmsg_len;
985
986         /* Store away the fd in the socket, so that it stays open as
987          * long as we run the child */
988         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
989         close_nointr_nofail(fd);
990
991         if (k < 0) {
992                 log_error("Failed to send FIFO fd: %m");
993                 return -errno;
994         }
995
996         /* And now make the FIFO unavailable as /dev/kmsg... */
997         unlink(from);
998         return 0;
999 }
1000
1001 static int setup_hostname(void) {
1002
1003         if (arg_share_system)
1004                 return 0;
1005
1006         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1007                 return -errno;
1008
1009         return 0;
1010 }
1011
1012 static int setup_journal(const char *directory) {
1013         sd_id128_t machine_id, this_id;
1014         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1015         char *id;
1016         int r;
1017
1018         p = strappend(directory, "/etc/machine-id");
1019         if (!p)
1020                 return log_oom();
1021
1022         r = read_one_line_file(p, &b);
1023         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1024                 return 0;
1025         else if (r < 0) {
1026                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1027                 return r;
1028         }
1029
1030         id = strstrip(b);
1031         if (isempty(id) && arg_link_journal == LINK_AUTO)
1032                 return 0;
1033
1034         /* Verify validity */
1035         r = sd_id128_from_string(id, &machine_id);
1036         if (r < 0) {
1037                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1038                 return r;
1039         }
1040
1041         r = sd_id128_get_machine(&this_id);
1042         if (r < 0) {
1043                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1044                 return r;
1045         }
1046
1047         if (sd_id128_equal(machine_id, this_id)) {
1048                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1049                          "Host and machine ids are equal (%s): refusing to link journals", id);
1050                 if (arg_link_journal == LINK_AUTO)
1051                         return 0;
1052                 return
1053                         -EEXIST;
1054         }
1055
1056         if (arg_link_journal == LINK_NO)
1057                 return 0;
1058
1059         free(p);
1060         p = strappend("/var/log/journal/", id);
1061         q = strjoin(directory, "/var/log/journal/", id, NULL);
1062         if (!p || !q)
1063                 return log_oom();
1064
1065         if (path_is_mount_point(p, false) > 0) {
1066                 if (arg_link_journal != LINK_AUTO) {
1067                         log_error("%s: already a mount point, refusing to use for journal", p);
1068                         return -EEXIST;
1069                 }
1070
1071                 return 0;
1072         }
1073
1074         if (path_is_mount_point(q, false) > 0) {
1075                 if (arg_link_journal != LINK_AUTO) {
1076                         log_error("%s: already a mount point, refusing to use for journal", q);
1077                         return -EEXIST;
1078                 }
1079
1080                 return 0;
1081         }
1082
1083         r = readlink_and_make_absolute(p, &d);
1084         if (r >= 0) {
1085                 if ((arg_link_journal == LINK_GUEST ||
1086                      arg_link_journal == LINK_AUTO) &&
1087                     path_equal(d, q)) {
1088
1089                         r = mkdir_p(q, 0755);
1090                         if (r < 0)
1091                                 log_warning("failed to create directory %s: %m", q);
1092                         return 0;
1093                 }
1094
1095                 if (unlink(p) < 0) {
1096                         log_error("Failed to remove symlink %s: %m", p);
1097                         return -errno;
1098                 }
1099         } else if (r == -EINVAL) {
1100
1101                 if (arg_link_journal == LINK_GUEST &&
1102                     rmdir(p) < 0) {
1103
1104                         if (errno == ENOTDIR) {
1105                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1106                                 return r;
1107                         } else {
1108                                 log_error("Failed to remove %s: %m", p);
1109                                 return -errno;
1110                         }
1111                 }
1112         } else if (r != -ENOENT) {
1113                 log_error("readlink(%s) failed: %m", p);
1114                 return r;
1115         }
1116
1117         if (arg_link_journal == LINK_GUEST) {
1118
1119                 if (symlink(q, p) < 0) {
1120                         log_error("Failed to symlink %s to %s: %m", q, p);
1121                         return -errno;
1122                 }
1123
1124                 r = mkdir_p(q, 0755);
1125                 if (r < 0)
1126                         log_warning("failed to create directory %s: %m", q);
1127                 return 0;
1128         }
1129
1130         if (arg_link_journal == LINK_HOST) {
1131                 r = mkdir_p(p, 0755);
1132                 if (r < 0) {
1133                         log_error("Failed to create %s: %m", p);
1134                         return r;
1135                 }
1136
1137         } else if (access(p, F_OK) < 0)
1138                 return 0;
1139
1140         if (dir_is_empty(q) == 0) {
1141                 log_error("%s not empty.", q);
1142                 return -ENOTEMPTY;
1143         }
1144
1145         r = mkdir_p(q, 0755);
1146         if (r < 0) {
1147                 log_error("Failed to create %s: %m", q);
1148                 return r;
1149         }
1150
1151         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1152                 log_error("Failed to bind mount journal from host into guest: %m");
1153                 return -errno;
1154         }
1155
1156         return 0;
1157 }
1158
1159 static int setup_kdbus(const char *dest, const char *path) {
1160         const char *p;
1161
1162         if (!path)
1163                 return 0;
1164
1165         p = strappenda(dest, "/dev/kdbus");
1166         if (mkdir(p, 0755) < 0) {
1167                 log_error("Failed to create kdbus path: %m");
1168                 return  -errno;
1169         }
1170
1171         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1172                 log_error("Failed to mount kdbus domain path: %m");
1173                 return -errno;
1174         }
1175
1176         return 0;
1177 }
1178
1179 static int drop_capabilities(void) {
1180         return capability_bounding_set_drop(~arg_retain, false);
1181 }
1182
1183 static int register_machine(pid_t pid) {
1184         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1185         _cleanup_bus_unref_ sd_bus *bus = NULL;
1186         int r;
1187
1188         if (!arg_register)
1189                 return 0;
1190
1191         r = sd_bus_default_system(&bus);
1192         if (r < 0) {
1193                 log_error("Failed to open system bus: %s", strerror(-r));
1194                 return r;
1195         }
1196
1197         if (arg_keep_unit) {
1198                 r = sd_bus_call_method(
1199                                 bus,
1200                                 "org.freedesktop.machine1",
1201                                 "/org/freedesktop/machine1",
1202                                 "org.freedesktop.machine1.Manager",
1203                                 "RegisterMachine",
1204                                 &error,
1205                                 NULL,
1206                                 "sayssus",
1207                                 arg_machine,
1208                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1209                                 "nspawn",
1210                                 "container",
1211                                 (uint32_t) pid,
1212                                 strempty(arg_directory));
1213         } else {
1214                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1215
1216                 r = sd_bus_message_new_method_call(
1217                                 bus,
1218                                 &m,
1219                                 "org.freedesktop.machine1",
1220                                 "/org/freedesktop/machine1",
1221                                 "org.freedesktop.machine1.Manager",
1222                                 "CreateMachine");
1223                 if (r < 0) {
1224                         log_error("Failed to create message: %s", strerror(-r));
1225                         return r;
1226                 }
1227
1228                 r = sd_bus_message_append(
1229                                 m,
1230                                 "sayssus",
1231                                 arg_machine,
1232                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1233                                 "nspawn",
1234                                 "container",
1235                                 (uint32_t) pid,
1236                                 strempty(arg_directory));
1237                 if (r < 0) {
1238                         log_error("Failed to append message arguments: %s", strerror(-r));
1239                         return r;
1240                 }
1241
1242                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1243                 if (r < 0) {
1244                         log_error("Failed to open container: %s", strerror(-r));
1245                         return r;
1246                 }
1247
1248                 if (!isempty(arg_slice)) {
1249                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1250                         if (r < 0) {
1251                                 log_error("Failed to append slice: %s", strerror(-r));
1252                                 return r;
1253                         }
1254                 }
1255
1256                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1257                 if (r < 0) {
1258                         log_error("Failed to add device policy: %s", strerror(-r));
1259                         return r;
1260                 }
1261
1262                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1263                                           /* Allow the container to
1264                                            * access and create the API
1265                                            * device nodes, so that
1266                                            * PrivateDevices= in the
1267                                            * container can work
1268                                            * fine */
1269                                           "/dev/null", "rwm",
1270                                           "/dev/zero", "rwm",
1271                                           "/dev/full", "rwm",
1272                                           "/dev/random", "rwm",
1273                                           "/dev/urandom", "rwm",
1274                                           "/dev/tty", "rwm",
1275                                           /* Allow the container
1276                                            * access to ptys. However,
1277                                            * do not permit the
1278                                            * container to ever create
1279                                            * these device nodes. */
1280                                           "/dev/pts/ptmx", "rw",
1281                                           "char-pts", "rw",
1282                                           /* Allow the container
1283                                            * access to all kdbus
1284                                            * devices. Again, the
1285                                            * container cannot create
1286                                            * these nodes, only use
1287                                            * them. We use a pretty
1288                                            * open match here, so that
1289                                            * the kernel API can still
1290                                            * change. */
1291                                           "char-kdbus", "rw",
1292                                           "char-kdbus/*", "rw");
1293                 if (r < 0) {
1294                         log_error("Failed to add device whitelist: %s", strerror(-r));
1295                         return r;
1296                 }
1297
1298                 r = sd_bus_message_close_container(m);
1299                 if (r < 0) {
1300                         log_error("Failed to close container: %s", strerror(-r));
1301                         return r;
1302                 }
1303
1304                 r = sd_bus_call(bus, m, 0, &error, NULL);
1305         }
1306
1307         if (r < 0) {
1308                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1309                 return r;
1310         }
1311
1312         return 0;
1313 }
1314
1315 static int terminate_machine(pid_t pid) {
1316         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1317         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1318         _cleanup_bus_unref_ sd_bus *bus = NULL;
1319         const char *path;
1320         int r;
1321
1322         if (!arg_register)
1323                 return 0;
1324
1325         r = sd_bus_default_system(&bus);
1326         if (r < 0) {
1327                 log_error("Failed to open system bus: %s", strerror(-r));
1328                 return r;
1329         }
1330
1331         r = sd_bus_call_method(
1332                         bus,
1333                         "org.freedesktop.machine1",
1334                         "/org/freedesktop/machine1",
1335                         "org.freedesktop.machine1.Manager",
1336                         "GetMachineByPID",
1337                         &error,
1338                         &reply,
1339                         "u",
1340                         (uint32_t) pid);
1341         if (r < 0) {
1342                 /* Note that the machine might already have been
1343                  * cleaned up automatically, hence don't consider it a
1344                  * failure if we cannot get the machine object. */
1345                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1346                 return 0;
1347         }
1348
1349         r = sd_bus_message_read(reply, "o", &path);
1350         if (r < 0)
1351                 return bus_log_parse_error(r);
1352
1353         r = sd_bus_call_method(
1354                         bus,
1355                         "org.freedesktop.machine1",
1356                         path,
1357                         "org.freedesktop.machine1.Machine",
1358                         "Terminate",
1359                         &error,
1360                         NULL,
1361                         NULL);
1362         if (r < 0) {
1363                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1364                 return 0;
1365         }
1366
1367         return 0;
1368 }
1369
1370 static int reset_audit_loginuid(void) {
1371         _cleanup_free_ char *p = NULL;
1372         int r;
1373
1374         if (arg_share_system)
1375                 return 0;
1376
1377         r = read_one_line_file("/proc/self/loginuid", &p);
1378         if (r == -ENOENT)
1379                 return 0;
1380         if (r < 0) {
1381                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1382                 return r;
1383         }
1384
1385         /* Already reset? */
1386         if (streq(p, "4294967295"))
1387                 return 0;
1388
1389         r = write_string_file("/proc/self/loginuid", "4294967295");
1390         if (r < 0) {
1391                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1392                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1393                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1394                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1395                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1396
1397                 sleep(5);
1398         }
1399
1400         return 0;
1401 }
1402
1403 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1404
1405 static int get_mac(struct ether_addr *mac) {
1406         int r;
1407
1408         uint8_t result[8];
1409         size_t l, sz;
1410         uint8_t *v;
1411
1412         l = strlen(arg_machine);
1413         sz = sizeof(sd_id128_t) + l;
1414         v = alloca(sz);
1415
1416         /* fetch some persistent data unique to the host */
1417         r = sd_id128_get_machine((sd_id128_t*) v);
1418         if (r < 0)
1419                 return r;
1420
1421         /* combine with some data unique (on this host) to this
1422          * container instance */
1423         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1424
1425         /* Let's hash the host machine ID plus the container name. We
1426          * use a fixed, but originally randomly created hash key here. */
1427         siphash24(result, v, sz, HASH_KEY.bytes);
1428
1429         assert_cc(ETH_ALEN <= sizeof(result));
1430         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1431
1432         /* see eth_random_addr in the kernel */
1433         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1434         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1435
1436         return 0;
1437 }
1438
1439 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1440         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1441         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1442         struct ether_addr mac;
1443         int r;
1444
1445         if (!arg_private_network)
1446                 return 0;
1447
1448         if (!arg_network_veth)
1449                 return 0;
1450
1451         /* Use two different interface name prefixes depending whether
1452          * we are in bridge mode or not. */
1453         if (arg_network_bridge)
1454                 memcpy(iface_name, "vb-", 3);
1455         else
1456                 memcpy(iface_name, "ve-", 3);
1457         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1458
1459         r = get_mac(&mac);
1460         if (r < 0) {
1461                 log_error("Failed to generate predictable MAC address for host0");
1462                 return r;
1463         }
1464
1465         r = sd_rtnl_open(&rtnl, 0);
1466         if (r < 0) {
1467                 log_error("Failed to connect to netlink: %s", strerror(-r));
1468                 return r;
1469         }
1470
1471         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1472         if (r < 0) {
1473                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1474                 return r;
1475         }
1476
1477         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1478         if (r < 0) {
1479                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1480                 return r;
1481         }
1482
1483         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1484         if (r < 0) {
1485                 log_error("Failed to open netlink container: %s", strerror(-r));
1486                 return r;
1487         }
1488
1489         r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1490         if (r < 0) {
1491                 log_error("Failed to append netlink kind: %s", strerror(-r));
1492                 return r;
1493         }
1494
1495         r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1496         if (r < 0) {
1497                 log_error("Failed to open netlink container: %s", strerror(-r));
1498                 return r;
1499         }
1500
1501         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1502         if (r < 0) {
1503                 log_error("Failed to open netlink container: %s", strerror(-r));
1504                 return r;
1505         }
1506
1507         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1508         if (r < 0) {
1509                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1510                 return r;
1511         }
1512
1513         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1514         if (r < 0) {
1515                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1516                 return r;
1517         }
1518
1519         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1520         if (r < 0) {
1521                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1522                 return r;
1523         }
1524
1525         r = sd_rtnl_message_close_container(m);
1526         if (r < 0) {
1527                 log_error("Failed to close netlink container: %s", strerror(-r));
1528                 return r;
1529         }
1530
1531         r = sd_rtnl_message_close_container(m);
1532         if (r < 0) {
1533                 log_error("Failed to close netlink container: %s", strerror(-r));
1534                 return r;
1535         }
1536
1537         r = sd_rtnl_message_close_container(m);
1538         if (r < 0) {
1539                 log_error("Failed to close netlink container: %s", strerror(-r));
1540                 return r;
1541         }
1542
1543         r = sd_rtnl_call(rtnl, m, 0, NULL);
1544         if (r < 0) {
1545                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1546                 return r;
1547         }
1548
1549         return 0;
1550 }
1551
1552 static int setup_bridge(const char veth_name[]) {
1553         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1554         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1555         int r, bridge;
1556
1557         if (!arg_private_network)
1558                 return 0;
1559
1560         if (!arg_network_veth)
1561                 return 0;
1562
1563         if (!arg_network_bridge)
1564                 return 0;
1565
1566         bridge = (int) if_nametoindex(arg_network_bridge);
1567         if (bridge <= 0) {
1568                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1569                 return -errno;
1570         }
1571
1572         r = sd_rtnl_open(&rtnl, 0);
1573         if (r < 0) {
1574                 log_error("Failed to connect to netlink: %s", strerror(-r));
1575                 return r;
1576         }
1577
1578         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1579         if (r < 0) {
1580                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1581                 return r;
1582         }
1583
1584         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1585         if (r < 0) {
1586                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1587                 return r;
1588         }
1589
1590         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1591         if (r < 0) {
1592                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1593                 return r;
1594         }
1595
1596         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1597         if (r < 0) {
1598                 log_error("Failed to add netlink master field: %s", strerror(-r));
1599                 return r;
1600         }
1601
1602         r = sd_rtnl_call(rtnl, m, 0, NULL);
1603         if (r < 0) {
1604                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1605                 return r;
1606         }
1607
1608         return 0;
1609 }
1610
1611 static int parse_interface(struct udev *udev, const char *name) {
1612         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1613         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1614         int ifi;
1615
1616         ifi = (int) if_nametoindex(name);
1617         if (ifi <= 0) {
1618                 log_error("Failed to resolve interface %s: %m", name);
1619                 return -errno;
1620         }
1621
1622         sprintf(ifi_str, "n%i", ifi);
1623         d = udev_device_new_from_device_id(udev, ifi_str);
1624         if (!d) {
1625                 log_error("Failed to get udev device for interface %s: %m", name);
1626                 return -errno;
1627         }
1628
1629         if (udev_device_get_is_initialized(d) <= 0) {
1630                 log_error("Network interface %s is not initialized yet.", name);
1631                 return -EBUSY;
1632         }
1633
1634         return ifi;
1635 }
1636
1637 static int move_network_interfaces(pid_t pid) {
1638         _cleanup_udev_unref_ struct udev *udev = NULL;
1639         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1640         char **i;
1641         int r;
1642
1643         if (!arg_private_network)
1644                 return 0;
1645
1646         if (strv_isempty(arg_network_interfaces))
1647                 return 0;
1648
1649         r = sd_rtnl_open(&rtnl, 0);
1650         if (r < 0) {
1651                 log_error("Failed to connect to netlink: %s", strerror(-r));
1652                 return r;
1653         }
1654
1655         udev = udev_new();
1656         if (!udev) {
1657                 log_error("Failed to connect to udev.");
1658                 return -ENOMEM;
1659         }
1660
1661         STRV_FOREACH(i, arg_network_interfaces) {
1662                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1663                 int ifi;
1664
1665                 ifi = parse_interface(udev, *i);
1666                 if (ifi < 0)
1667                         return ifi;
1668
1669                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1670                 if (r < 0) {
1671                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1672                         return r;
1673                 }
1674
1675                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1676                 if (r < 0) {
1677                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1678                         return r;
1679                 }
1680
1681                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1682                 if (r < 0) {
1683                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1684                         return r;
1685                 }
1686         }
1687
1688         return 0;
1689 }
1690
1691 static int setup_macvlan(pid_t pid) {
1692         _cleanup_udev_unref_ struct udev *udev = NULL;
1693         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1694         char **i;
1695         int r;
1696
1697         if (!arg_private_network)
1698                 return 0;
1699
1700         if (strv_isempty(arg_network_macvlan))
1701                 return 0;
1702
1703         r = sd_rtnl_open(&rtnl, 0);
1704         if (r < 0) {
1705                 log_error("Failed to connect to netlink: %s", strerror(-r));
1706                 return r;
1707         }
1708
1709         udev = udev_new();
1710         if (!udev) {
1711                 log_error("Failed to connect to udev.");
1712                 return -ENOMEM;
1713         }
1714
1715         STRV_FOREACH(i, arg_network_macvlan) {
1716                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1717                 _cleanup_free_ char *n = NULL;
1718                 int ifi;
1719
1720                 ifi = parse_interface(udev, *i);
1721                 if (ifi < 0)
1722                         return ifi;
1723
1724                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1725                 if (r < 0) {
1726                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1727                         return r;
1728                 }
1729
1730                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1731                 if (r < 0) {
1732                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1733                         return r;
1734                 }
1735
1736                 n = strappend("mv-", *i);
1737                 if (!n)
1738                         return log_oom();
1739
1740                 strshorten(n, IFNAMSIZ-1);
1741
1742                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1743                 if (r < 0) {
1744                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1745                         return r;
1746                 }
1747
1748                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1749                 if (r < 0) {
1750                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1751                         return r;
1752                 }
1753
1754                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1755                 if (r < 0) {
1756                         log_error("Failed to open netlink container: %s", strerror(-r));
1757                         return r;
1758                 }
1759
1760                 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "macvlan");
1761                 if (r < 0) {
1762                         log_error("Failed to append netlink kind: %s", strerror(-r));
1763                         return r;
1764                 }
1765
1766                 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1767                 if (r < 0) {
1768                         log_error("Failed to open netlink container: %s", strerror(-r));
1769                         return r;
1770                 }
1771
1772                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1773                 if (r < 0) {
1774                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1775                         return r;
1776                 }
1777
1778                 r = sd_rtnl_message_close_container(m);
1779                 if (r < 0) {
1780                         log_error("Failed to close netlink container: %s", strerror(-r));
1781                         return r;
1782                 }
1783
1784                 r = sd_rtnl_message_close_container(m);
1785                 if (r < 0) {
1786                         log_error("Failed to close netlink container: %s", strerror(-r));
1787                         return r;
1788                 }
1789
1790                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1791                 if (r < 0) {
1792                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1793                         return r;
1794                 }
1795         }
1796
1797         return 0;
1798 }
1799
/* Installs a seccomp filter that allows everything except
 * socket(AF_NETLINK, *, NETLINK_AUDIT), which is made to fail with
 * EAFNOSUPPORT.
 *
 * Rationale: audit is broken in containers and much of the userspace audit
 * hookup fails when run inside one. Audit userspace takes EAFNOSUPPORT as
 * the indication that audit is disabled in the kernel, so refusing to create
 * audit sockets simply turns it off.
 *
 * Returns 0 on success (and always when built without HAVE_SECCOMP),
 * otherwise logs the problem and returns a negative error code. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx filter;
        int k;

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        k = seccomp_add_secondary_archs(filter);
        if (k < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-k));
                goto release;
        }

        /* Match on the first (domain) and third (protocol) argument of
         * socket(); the second (type) argument is left unconstrained. */
        k = seccomp_rule_add(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (k < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-k));
                goto release;
        }

        k = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (k < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-k));
                goto release;
        }

        k = seccomp_load(filter);
        if (k < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-k));

release:
        /* The filter context is released on both the success and failure paths */
        seccomp_release(filter);
        return k;
#else
        return 0;
#endif

}
1856
1857 static int setup_image(char **device_path, int *loop_nr) {
1858         struct loop_info64 info = {
1859                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1860         };
1861         _cleanup_close_ int fd = -1, control = -1, loop = -1;
1862         _cleanup_free_ char* loopdev = NULL;
1863         struct stat st;
1864         int r, nr;
1865
1866         assert(device_path);
1867         assert(loop_nr);
1868
1869         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1870         if (fd < 0) {
1871                 log_error("Failed to open %s: %m", arg_image);
1872                 return -errno;
1873         }
1874
1875         if (fstat(fd, &st) < 0) {
1876                 log_error("Failed to stat %s: %m", arg_image);
1877                 return -errno;
1878         }
1879
1880         if (S_ISBLK(st.st_mode)) {
1881                 char *p;
1882
1883                 p = strdup(arg_image);
1884                 if (!p)
1885                         return log_oom();
1886
1887                 *device_path = p;
1888
1889                 *loop_nr = -1;
1890
1891                 r = fd;
1892                 fd = -1;
1893
1894                 return r;
1895         }
1896
1897         if (!S_ISREG(st.st_mode)) {
1898                 log_error("%s is not a regular file or block device: %m", arg_image);
1899                 return -EINVAL;
1900         }
1901
1902         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1903         if (control < 0) {
1904                 log_error("Failed to open /dev/loop-control: %m");
1905                 return -errno;
1906         }
1907
1908         nr = ioctl(control, LOOP_CTL_GET_FREE);
1909         if (nr < 0) {
1910                 log_error("Failed to allocate loop device: %m");
1911                 return -errno;
1912         }
1913
1914         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1915                 return log_oom();
1916
1917         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1918         if (loop < 0) {
1919                 log_error("Failed to open loop device %s: %m", loopdev);
1920                 return -errno;
1921         }
1922
1923         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1924                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1925                 return -errno;
1926         }
1927
1928         if (arg_read_only)
1929                 info.lo_flags |= LO_FLAGS_READ_ONLY;
1930
1931         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1932                 log_error("Failed to set loopback settings on %s: %m", loopdev);
1933                 return -errno;
1934         }
1935
1936         *device_path = loopdev;
1937         loopdev = NULL;
1938
1939         *loop_nr = nr;
1940
1941         r = loop;
1942         loop = -1;
1943
1944         return r;
1945 }
1946
#ifdef HAVE_BLKID
/* Helper for dissect_image(): records node as the device to use for one
 * partition role (root, home, ...), unless a partition with a lower or equal
 * partition number was already recorded for that role.
 *
 * *device, *device_nr and *device_rw are only modified when node is
 * accepted. Returns 0 on success (whether or not node was accepted) and
 * -ENOMEM if the node name cannot be duplicated. */
static int dissect_image_pick(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, unsigned long long flags) {

        char *copy;

        assert(device);
        assert(device_nr);
        assert(device_rw);
        assert(node);

        if (*device && nr >= *device_nr)
                return 0;

        copy = strdup(node);
        if (!copy)
                return -ENOMEM;

        free(*device);
        *device = copy;
        *device_nr = nr;
        *device_rw = !(flags & GPT_FLAG_READ_ONLY);

        return 0;
}
#endif

/* Inspects the GPT partition table of the block device open as fd and picks
 * the partitions to mount as root, /home and /srv, based on their partition
 * type UUIDs.
 *
 * Partitions flagged GPT_FLAG_NO_AUTO are ignored. If several partitions of
 * the same type exist, the one with the lowest partition number wins. A
 * GPT_ROOT_NATIVE partition is preferred over a GPT_ROOT_SECONDARY one;
 * *secondary reports which of the two was chosen.
 *
 * On success 0 is returned, *root_device and *root_device_rw are always set,
 * and the home/srv outputs are set only if a matching partition was found
 * (they are left untouched otherwise). The caller owns the returned device
 * node strings. The *_rw outputs are false for partitions carrying
 * GPT_FLAG_READ_ONLY.
 *
 * On failure the problem is logged and a negative errno-style code is
 * returned; -ENOTSUP if built without HAVE_BLKID. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared first so that a failure which leaves it untouched
         * can be told apart; such a failure is reported as OOM. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambiguous result, 1: nothing detected */
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        /* pttype stays NULL if the lookup fails; streq_ptr() is used for the comparison */
        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate the udev devices below the whole-disk device, i.e. its partitions */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                /* Map the udev device back to its partition table entry */
                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                /* Partitions of any other type are ignored */
                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_image_pick(&home, &home_nr, &home_rw, node, nr, flags);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_image_pick(&srv, &srv_nr, &srv_rw, node, nr, flags);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_image_pick(&root, &root_nr, &root_rw, node, nr, flags);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_image_pick(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, flags);
#endif
                else
                        r = 0;

                if (r < 0)
                        return log_oom();
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Transfer ownership of the chosen device node strings to the
         * caller; resetting the locals to NULL keeps the cleanup handlers
         * from freeing them. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2196
/* Mounts the block device what on the directory where (with the optional
 * sub-path directory appended), after probing its file system type with
 * libblkid.
 *
 * The mount uses MS_NODEV, and additionally MS_RDONLY if rw is false or
 * arg_read_only is set. Devices whose detected type is "crypto_LUKS" are
 * refused with -ENOTSUP.
 *
 * Returns 0 on success. On failure the problem is logged and a negative
 * errno-style code is returned; -ENOTSUP if built without HAVE_BLKID. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally read-only container overrides the per-device setting */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns 1 if nothing was detected and
                 * -2 if the result is ambiguous; in both cases the type is
                 * simply unknown. A return of -1 is a genuine probing error
                 * and is reported with errno by the branch below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2262
/* Mounts the partitions selected from a disk image below the directory
 * where: root_device on where itself, home_device on where + "/home" and
 * srv_device on where + "/srv". Any device passed as NULL is skipped. The
 * *_rw flags are passed on to mount_device() to request a writable mount.
 *
 * Returns 0 on success, or the negative errno-style code of the first mount
 * that failed (after logging). */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount below the directory the caller asked for, rather than
         * silently falling back to the global arg_directory. The root file
         * system has to come first, since the other mount points live
         * inside of it. */
        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2298
2299 static void loop_remove(int nr, int *image_fd) {
2300         _cleanup_close_ int control = -1;
2301
2302         if (nr < 0)
2303                 return;
2304
2305         if (image_fd && *image_fd >= 0) {
2306                 ioctl(*image_fd, LOOP_CLR_FD);
2307                 close_nointr_nofail(*image_fd);
2308                 *image_fd = -1;
2309         }
2310
2311         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2312         if (control < 0)
2313                 return;
2314
2315         ioctl(control, LOOP_CTL_REMOVE, nr);
2316 }
2317
/* Forks off a child that runs "getent <database> <key>" with an empty
 * environment. The child's stdout is connected to a pipe, its stdin and
 * stderr to /dev/null.
 *
 * On success the read end of the pipe is returned (owned by the caller) and
 * the child's PID is stored in *rpid; the caller is expected to reap the
 * child. On failure the problem is logged and a negative errno-style code is
 * returned. */
static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
        int pipe_fds[2];
        pid_t pid;

        assert(database);
        assert(key);
        assert(rpid);

        if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
                log_error("Failed to allocate pipe: %m");
                return -errno;
        }

        pid = fork();
        if (pid < 0) {
                int saved_errno = errno;

                log_error("Failed to fork getent child: %m");

                /* Don't leak the pipe if no child could be created */
                close_nointr_nofail(pipe_fds[0]);
                close_nointr_nofail(pipe_fds[1]);

                return -saved_errno;
        } else if (pid == 0) {
                int nullfd;
                char *empty_env = NULL;

                /* Child: the duplicated descriptors are created without
                 * O_CLOEXEC (flags 0), so they survive the exec below. */
                if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                /* Only close the originals if they are not one of the
                 * standard descriptors themselves. */
                if (pipe_fds[0] > 2)
                        close_nointr_nofail(pipe_fds[0]);
                if (pipe_fds[1] > 2)
                        close_nointr_nofail(pipe_fds[1]);

                nullfd = open("/dev/null", O_RDWR);
                if (nullfd < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDIN_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDERR_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (nullfd > 2)
                        close_nointr_nofail(nullfd);

                reset_all_signal_handlers();
                close_all_fds(NULL, 0);

                /* Try both common install locations. The argument list of a
                 * variadic exec call is terminated by a null pointer of type
                 * char*. &empty_env is an environment array that consists of
                 * the terminating NULL only. */
                execle("/usr/bin/getent", "getent", database, key, (char*) NULL, &empty_env);
                execle("/bin/getent", "getent", database, key, (char*) NULL, &empty_env);
                _exit(EXIT_FAILURE);
        }

        /* Parent: only the read end is needed */
        close_nointr_nofail(pipe_fds[1]);
        pipe_fds[1] = -1;

        *rpid = pid;

        return pipe_fds[0];
}
2375
2376 static int change_uid_gid(char **_home) {
2377         char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2378         _cleanup_free_ uid_t *uids = NULL;
2379         _cleanup_free_ char *home = NULL;
2380         _cleanup_fclose_ FILE *f = NULL;
2381         _cleanup_close_ int fd = -1;
2382         unsigned n_uids = 0;
2383         size_t sz, l;
2384         uid_t uid;
2385         gid_t gid;
2386         pid_t pid;
2387         int r;
2388
2389         assert(_home);
2390
2391         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2392                 /* Reset everything fully to 0, just in case */
2393
2394                 if (setgroups(0, NULL) < 0) {
2395                         log_error("setgroups() failed: %m");
2396                         return -errno;
2397                 }
2398
2399                 if (setresgid(0, 0, 0) < 0) {
2400                         log_error("setregid() failed: %m");
2401                         return -errno;
2402                 }
2403
2404                 if (setresuid(0, 0, 0) < 0) {
2405                         log_error("setreuid() failed: %m");
2406                         return -errno;
2407                 }
2408
2409                 *_home = NULL;
2410                 return 0;
2411         }
2412
2413         /* First, get user credentials */
2414         fd = spawn_getent("passwd", arg_user, &pid);
2415         if (fd < 0)
2416                 return fd;
2417
2418         f = fdopen(fd, "r");
2419         if (!f)
2420                 return log_oom();
2421         fd = -1;
2422
2423         if (!fgets(line, sizeof(line), f)) {
2424
2425                 if (!ferror(f)) {
2426                         log_error("Failed to resolve user %s.", arg_user);
2427                         return -ESRCH;
2428                 }
2429
2430                 log_error("Failed to read from getent: %m");
2431                 return -errno;
2432         }
2433
2434         truncate_nl(line);
2435
2436         wait_for_terminate_and_warn("getent passwd", pid);
2437
2438         x = strchr(line, ':');
2439         if (!x) {
2440                 log_error("/etc/passwd entry has invalid user field.");
2441                 return -EIO;
2442         }
2443
2444         u = strchr(x+1, ':');
2445         if (!u) {
2446                 log_error("/etc/passwd entry has invalid password field.");
2447                 return -EIO;
2448         }
2449
2450         u++;
2451         g = strchr(u, ':');
2452         if (!g) {
2453                 log_error("/etc/passwd entry has invalid UID field.");
2454                 return -EIO;
2455         }
2456
2457         *g = 0;
2458         g++;
2459         x = strchr(g, ':');
2460         if (!x) {
2461                 log_error("/etc/passwd entry has invalid GID field.");
2462                 return -EIO;
2463         }
2464
2465         *x = 0;
2466         h = strchr(x+1, ':');
2467         if (!h) {
2468                 log_error("/etc/passwd entry has invalid GECOS field.");
2469                 return -EIO;
2470         }
2471
2472         h++;
2473         x = strchr(h, ':');
2474         if (!x) {
2475                 log_error("/etc/passwd entry has invalid home directory field.");
2476                 return -EIO;
2477         }
2478
2479         *x = 0;
2480
2481         r = parse_uid(u, &uid);
2482         if (r < 0) {
2483                 log_error("Failed to parse UID of user.");
2484                 return -EIO;
2485         }
2486
2487         r = parse_gid(g, &gid);
2488         if (r < 0) {
2489                 log_error("Failed to parse GID of user.");
2490                 return -EIO;
2491         }
2492
2493         home = strdup(h);
2494         if (!home)
2495                 return log_oom();
2496
2497         /* Second, get group memberships */
2498         fd = spawn_getent("initgroups", arg_user, &pid);
2499         if (fd < 0)
2500                 return fd;
2501
2502         fclose(f);
2503         f = fdopen(fd, "r");
2504         if (!f)
2505                 return log_oom();
2506         fd = -1;
2507
2508         if (!fgets(line, sizeof(line), f)) {
2509                 if (!ferror(f)) {
2510                         log_error("Failed to resolve user %s.", arg_user);
2511                         return -ESRCH;
2512                 }
2513
2514                 log_error("Failed to read from getent: %m");
2515                 return -errno;
2516         }
2517
2518         truncate_nl(line);
2519
2520         wait_for_terminate_and_warn("getent initgroups", pid);
2521
2522         /* Skip over the username and subsequent separator whitespace */
2523         x = line;
2524         x += strcspn(x, WHITESPACE);
2525         x += strspn(x, WHITESPACE);
2526
2527         FOREACH_WORD(w, l, x, state) {
2528                 char c[l+1];
2529
2530                 memcpy(c, w, l);
2531                 c[l] = 0;
2532
2533                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2534                         return log_oom();
2535
2536                 r = parse_uid(c, &uids[n_uids++]);
2537                 if (r < 0) {
2538                         log_error("Failed to parse group data from getent.");
2539                         return -EIO;
2540                 }
2541         }
2542
2543         r = mkdir_parents(home, 0775);
2544         if (r < 0) {
2545                 log_error("Failed to make home root directory: %s", strerror(-r));
2546                 return r;
2547         }
2548
2549         r = mkdir_safe(home, 0755, uid, gid);
2550         if (r < 0 && r != -EEXIST) {
2551                 log_error("Failed to make home directory: %s", strerror(-r));
2552                 return r;
2553         }
2554
2555         fchown(STDIN_FILENO, uid, gid);
2556         fchown(STDOUT_FILENO, uid, gid);
2557         fchown(STDERR_FILENO, uid, gid);
2558
2559         if (setgroups(n_uids, uids) < 0) {
2560                 log_error("Failed to set auxiliary groups: %m");
2561                 return -errno;
2562         }
2563
2564         if (setresgid(gid, gid, gid) < 0) {
2565                 log_error("setregid() failed: %m");
2566                 return -errno;
2567         }
2568
2569         if (setresuid(uid, uid, uid) < 0) {
2570                 log_error("setreuid() failed: %m");
2571                 return -errno;
2572         }
2573
2574         if (_home) {
2575                 *_home = home;
2576                 home = NULL;
2577         }
2578
2579         return 0;
2580 }
2581
2582 int main(int argc, char *argv[]) {
2583
2584         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2585         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2586         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2587         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
2588         _cleanup_fdset_free_ FDSet *fds = NULL;
2589         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2590         const char *console = NULL;
2591         char veth_name[IFNAMSIZ];
2592         bool secondary = false;
2593         pid_t pid = 0;
2594         sigset_t mask;
2595
2596         log_parse_environment();
2597         log_open();
2598
2599         k = parse_argv(argc, argv);
2600         if (k < 0)
2601                 goto finish;
2602         else if (k == 0) {
2603                 r = EXIT_SUCCESS;
2604                 goto finish;
2605         }
2606
2607         if (!arg_image) {
2608                 if (arg_directory) {
2609                         char *p;
2610
2611                         p = path_make_absolute_cwd(arg_directory);
2612                         free(arg_directory);
2613                         arg_directory = p;
2614                 } else
2615                         arg_directory = get_current_dir_name();
2616
2617                 if (!arg_directory) {
2618                         log_error("Failed to determine path, please use -D.");
2619                         goto finish;
2620                 }
2621                 path_kill_slashes(arg_directory);
2622         }
2623
2624         if (!arg_machine) {
2625                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2626                 if (!arg_machine) {
2627                         log_oom();
2628                         goto finish;
2629                 }
2630
2631                 hostname_cleanup(arg_machine, false);
2632                 if (isempty(arg_machine)) {
2633                         log_error("Failed to determine machine name automatically, please use -M.");
2634                         goto finish;
2635                 }
2636         }
2637
2638         if (geteuid() != 0) {
2639                 log_error("Need to be root.");
2640                 goto finish;
2641         }
2642
2643         if (sd_booted() <= 0) {
2644                 log_error("Not running on a systemd system.");
2645                 goto finish;
2646         }
2647
2648         log_close();
2649         n_fd_passed = sd_listen_fds(false);
2650         if (n_fd_passed > 0) {
2651                 k = fdset_new_listen_fds(&fds, false);
2652                 if (k < 0) {
2653                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2654                         goto finish;
2655                 }
2656         }
2657         fdset_close_others(fds);
2658         log_open();
2659
2660         if (arg_directory) {
2661                 if (path_equal(arg_directory, "/")) {
2662                         log_error("Spawning container on root directory not supported.");
2663                         goto finish;
2664                 }
2665
2666                 if (arg_boot) {
2667                         if (path_is_os_tree(arg_directory) <= 0) {
2668                                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2669                                 goto finish;
2670                         }
2671                 } else {
2672                         const char *p;
2673
2674                         p = strappenda(arg_directory,
2675                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2676                         if (access(p, F_OK) < 0) {
2677                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2678                                 goto finish;
2679
2680                         }
2681                 }
2682         } else {
2683                 char template[] = "/tmp/nspawn-root-XXXXXX";
2684
2685                 if (!mkdtemp(template)) {
2686                         log_error("Failed to create temporary directory: %m");
2687                         r = -errno;
2688                         goto finish;
2689                 }
2690
2691                 arg_directory = strdup(template);
2692                 if (!arg_directory) {
2693                         r = log_oom();
2694                         goto finish;
2695                 }
2696
2697                 image_fd = setup_image(&device_path, &loop_nr);
2698                 if (image_fd < 0) {
2699                         r = image_fd;
2700                         goto finish;
2701                 }
2702
2703                 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2704                 if (r < 0)
2705                         goto finish;
2706         }
2707
2708         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2709         if (master < 0) {
2710                 log_error("Failed to acquire pseudo tty: %m");
2711                 goto finish;
2712         }
2713
2714         console = ptsname(master);
2715         if (!console) {
2716                 log_error("Failed to determine tty name: %m");
2717                 goto finish;
2718         }
2719
2720         if (!arg_quiet)
2721                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
2722
2723         if (unlockpt(master) < 0) {
2724                 log_error("Failed to unlock tty: %m");
2725                 goto finish;
2726         }
2727
2728         if (access("/dev/kdbus/control", F_OK) >= 0) {
2729
2730                 if (arg_share_system) {
2731                         kdbus_domain = strdup("/dev/kdbus");
2732                         if (!kdbus_domain) {
2733                                 log_oom();
2734                                 goto finish;
2735                         }
2736                 } else {
2737                         const char *ns;
2738
2739                         ns = strappenda("machine-", arg_machine);
2740                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2741                         if (r < 0)
2742                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2743                         else
2744                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2745                 }
2746         }
2747
2748         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2749                 log_error("Failed to create kmsg socket pair: %m");
2750                 goto finish;
2751         }
2752
2753         sd_notify(0, "READY=1");
2754
2755         assert_se(sigemptyset(&mask) == 0);
2756         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2757         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2758
2759         for (;;) {
2760                 int parent_ready_fd = -1, child_ready_fd = -1;
2761                 siginfo_t status;
2762                 eventfd_t x;
2763
2764                 parent_ready_fd = eventfd(0, EFD_CLOEXEC);
2765                 if (parent_ready_fd < 0) {
2766                         log_error("Failed to create event fd: %m");
2767                         goto finish;
2768                 }
2769
2770                 child_ready_fd = eventfd(0, EFD_CLOEXEC);
2771                 if (child_ready_fd < 0) {
2772                         log_error("Failed to create event fd: %m");
2773                         goto finish;
2774                 }
2775
2776                 pid = syscall(__NR_clone,
2777                               SIGCHLD|CLONE_NEWNS|
2778                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2779                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
2780                 if (pid < 0) {
2781                         if (errno == EINVAL)
2782                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2783                         else
2784                                 log_error("clone() failed: %m");
2785
2786                         goto finish;
2787                 }
2788
2789                 if (pid == 0) {
2790                         /* child */
2791                         _cleanup_free_ char *home = NULL;
2792                         unsigned n_env = 2;
2793                         const char *envp[] = {
2794                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
2795                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2796                                 NULL, /* TERM */
2797                                 NULL, /* HOME */
2798                                 NULL, /* USER */
2799                                 NULL, /* LOGNAME */
2800                                 NULL, /* container_uuid */
2801                                 NULL, /* LISTEN_FDS */
2802                                 NULL, /* LISTEN_PID */
2803                                 NULL
2804                         };
2805                         char **env_use;
2806
2807                         envp[n_env] = strv_find_prefix(environ, "TERM=");
2808                         if (envp[n_env])
2809                                 n_env ++;
2810
2811                         close_nointr_nofail(master);
2812                         master = -1;
2813
2814                         close_nointr(STDIN_FILENO);
2815                         close_nointr(STDOUT_FILENO);
2816                         close_nointr(STDERR_FILENO);
2817
2818                         close_nointr_nofail(kmsg_socket_pair[0]);
2819                         kmsg_socket_pair[0] = -1;
2820
2821                         reset_all_signal_handlers();
2822
2823                         assert_se(sigemptyset(&mask) == 0);
2824                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2825
2826                         k = open_terminal(console, O_RDWR);
2827                         if (k != STDIN_FILENO) {
2828                                 if (k >= 0) {
2829                                         close_nointr_nofail(k);
2830                                         k = -EINVAL;
2831                                 }
2832
2833                                 log_error("Failed to open console: %s", strerror(-k));
2834                                 goto child_fail;
2835                         }
2836
2837                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2838                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2839                                 log_error("Failed to duplicate console: %m");
2840                                 goto child_fail;
2841                         }
2842
2843                         if (setsid() < 0) {
2844                                 log_error("setsid() failed: %m");
2845                                 goto child_fail;
2846                         }
2847
2848                         if (reset_audit_loginuid() < 0)
2849                                 goto child_fail;
2850
2851                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2852                                 log_error("PR_SET_PDEATHSIG failed: %m");
2853                                 goto child_fail;
2854                         }
2855
2856                         /* Mark everything as slave, so that we still
2857                          * receive mounts from the real root, but don't
2858                          * propagate mounts to the real root. */
2859                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2860                                 log_error("MS_SLAVE|MS_REC failed: %m");
2861                                 goto child_fail;
2862                         }
2863
2864                         if (mount_devices(arg_directory,
2865                                           root_device, root_device_rw,
2866                                           home_device, home_device_rw,
2867                                           srv_device, srv_device_rw) < 0)
2868                                 goto child_fail;
2869
2870                         /* Turn directory into bind mount */
2871                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2872                                 log_error("Failed to make bind mount.");
2873                                 goto child_fail;
2874                         }
2875
2876                         if (arg_read_only)
2877                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2878                                         log_error("Failed to make read-only.");
2879                                         goto child_fail;
2880                                 }
2881
2882                         if (mount_all(arg_directory) < 0)
2883                                 goto child_fail;
2884
2885                         if (copy_devnodes(arg_directory) < 0)
2886                                 goto child_fail;
2887
2888                         if (setup_ptmx(arg_directory) < 0)
2889                                 goto child_fail;
2890
2891                         dev_setup(arg_directory);
2892
2893                         if (audit_still_doesnt_work_in_containers() < 0)
2894                                 goto child_fail;
2895
2896                         if (setup_dev_console(arg_directory, console) < 0)
2897                                 goto child_fail;
2898
2899                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2900                                 goto child_fail;
2901
2902                         close_nointr_nofail(kmsg_socket_pair[1]);
2903                         kmsg_socket_pair[1] = -1;
2904
2905                         if (setup_boot_id(arg_directory) < 0)
2906                                 goto child_fail;
2907
2908                         if (setup_timezone(arg_directory) < 0)
2909                                 goto child_fail;
2910
2911                         if (setup_resolv_conf(arg_directory) < 0)
2912                                 goto child_fail;
2913
2914                         if (setup_journal(arg_directory) < 0)
2915                                 goto child_fail;
2916
2917                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
2918                                 goto child_fail;
2919
2920                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2921                                 goto child_fail;
2922
2923                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
2924                                 goto child_fail;
2925
2926                         /* Tell the parent that we are ready, and that
2927                          * it can cgroupify us to that we lack access
2928                          * to certain devices and resources. */
2929                         eventfd_write(child_ready_fd, 1);
2930                         close_nointr_nofail(child_ready_fd);
2931                         child_ready_fd = -1;
2932
2933                         if (chdir(arg_directory) < 0) {
2934                                 log_error("chdir(%s) failed: %m", arg_directory);
2935                                 goto child_fail;
2936                         }
2937
2938                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2939                                 log_error("mount(MS_MOVE) failed: %m");
2940                                 goto child_fail;
2941                         }
2942
2943                         if (chroot(".") < 0) {
2944                                 log_error("chroot() failed: %m");
2945                                 goto child_fail;
2946                         }
2947
2948                         if (chdir("/") < 0) {
2949                                 log_error("chdir() failed: %m");
2950                                 goto child_fail;
2951                         }
2952
2953                         umask(0022);
2954
2955                         if (arg_private_network)
2956                                 loopback_setup();
2957
2958                         if (drop_capabilities() < 0) {
2959                                 log_error("drop_capabilities() failed: %m");
2960                                 goto child_fail;
2961                         }
2962
2963                         r = change_uid_gid(&home);
2964                         if (r < 0)
2965                                 goto child_fail;
2966
2967                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2968                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2969                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
2970                                 log_oom();
2971                                 goto child_fail;
2972                         }
2973
2974                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2975                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
2976                                         log_oom();
2977                                         goto child_fail;
2978                                 }
2979                         }
2980
2981                         if (fdset_size(fds) > 0) {
2982                                 k = fdset_cloexec(fds, false);
2983                                 if (k < 0) {
2984                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
2985                                         goto child_fail;
2986                                 }
2987
2988                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
2989                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
2990                                         log_oom();
2991                                         goto child_fail;
2992                                 }
2993                         }
2994
2995                         setup_hostname();
2996
2997                         if (arg_personality != 0xffffffffLU) {
2998                                 if (personality(arg_personality) < 0) {
2999                                         log_error("personality() failed: %m");
3000                                         goto child_fail;
3001                                 }
3002                         } else if (secondary) {
3003                                 if (personality(PER_LINUX32) < 0) {
3004                                         log_error("personality() failed: %m");
3005                                         goto child_fail;
3006                                 }
3007                         }
3008
3009 #ifdef HAVE_SELINUX
3010                         if (arg_selinux_context)
3011                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3012                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3013                                         goto child_fail;
3014                                 }
3015 #endif
3016
3017                         if (!strv_isempty(arg_setenv)) {
3018                                 char **n;
3019
3020                                 n = strv_env_merge(2, envp, arg_setenv);
3021                                 if (!n) {
3022                                         log_oom();
3023                                         goto child_fail;
3024                                 }
3025
3026                                 env_use = n;
3027                         } else
3028                                 env_use = (char**) envp;
3029
3030                         /* Wait until the parent is ready with the setup, too... */
3031                         eventfd_read(parent_ready_fd, &x);
3032                         close_nointr_nofail(parent_ready_fd);
3033                         parent_ready_fd = -1;
3034
3035                         if (arg_boot) {
3036                                 char **a;
3037                                 size_t l;
3038
3039                                 /* Automatically search for the init system */
3040
3041                                 l = 1 + argc - optind;
3042                                 a = newa(char*, l + 1);
3043                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3044
3045                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3046                                 execve(a[0], a, env_use);
3047
3048                                 a[0] = (char*) "/lib/systemd/systemd";
3049                                 execve(a[0], a, env_use);
3050
3051                                 a[0] = (char*) "/sbin/init";
3052                                 execve(a[0], a, env_use);
3053                         } else if (argc > optind)
3054                                 execvpe(argv[optind], argv + optind, env_use);
3055                         else {
3056                                 chdir(home ? home : "/root");
3057                                 execle("/bin/bash", "-bash", NULL, env_use);
3058                                 execle("/bin/sh", "-sh", NULL, env_use);
3059                         }
3060
3061                         log_error("execv() failed: %m");
3062
3063                 child_fail:
3064                         _exit(EXIT_FAILURE);
3065                 }
3066
3067                 fdset_free(fds);
3068                 fds = NULL;
3069
3070                 /* Wait until the child reported that it is ready with
3071                  * all it needs to do with priviliges. After we got
3072                  * the notification we can make the process join its
3073                  * cgroup which might limit what it can do */
3074                 eventfd_read(child_ready_fd, &x);
3075
3076                 r = register_machine(pid);
3077                 if (r < 0)
3078                         goto finish;
3079
3080                 r = move_network_interfaces(pid);
3081                 if (r < 0)
3082                         goto finish;
3083
3084                 r = setup_veth(pid, veth_name);
3085                 if (r < 0)
3086                         goto finish;
3087
3088                 r = setup_bridge(veth_name);
3089                 if (r < 0)
3090                         goto finish;
3091
3092                 r = setup_macvlan(pid);
3093                 if (r < 0)
3094                         goto finish;
3095
3096                 /* Notify the child that the parent is ready with all
3097                  * its setup, and thtat the child can now hand over
3098                  * control to the code to run inside the container. */
3099                 eventfd_write(parent_ready_fd, 1);
3100
3101                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3102                 if (k < 0) {
3103                         r = EXIT_FAILURE;
3104                         break;
3105                 }
3106
3107                 if (!arg_quiet)
3108                         putc('\n', stdout);
3109
3110                 /* Kill if it is not dead yet anyway */
3111                 terminate_machine(pid);
3112
3113                 /* Redundant, but better safe than sorry */
3114                 kill(pid, SIGKILL);
3115
3116                 k = wait_for_terminate(pid, &status);
3117                 pid = 0;
3118
3119                 if (k < 0) {
3120                         r = EXIT_FAILURE;
3121                         break;
3122                 }
3123
3124                 if (status.si_code == CLD_EXITED) {
3125                         r = status.si_status;
3126                         if (status.si_status != 0) {
3127                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
3128                                 break;
3129                         }
3130
3131                         if (!arg_quiet)
3132                                 log_debug("Container %s exited successfully.", arg_machine);
3133                         break;
3134                 } else if (status.si_code == CLD_KILLED &&
3135                            status.si_status == SIGINT) {
3136
3137                         if (!arg_quiet)
3138                                 log_info("Container %s has been shut down.", arg_machine);
3139                         r = 0;
3140                         break;
3141                 } else if (status.si_code == CLD_KILLED &&
3142                            status.si_status == SIGHUP) {
3143
3144                         if (!arg_quiet)
3145                                 log_info("Container %s is being rebooted.", arg_machine);
3146                         continue;
3147                 } else if (status.si_code == CLD_KILLED ||
3148                            status.si_code == CLD_DUMPED) {
3149
3150                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3151                         r = EXIT_FAILURE;
3152                         break;
3153                 } else {
3154                         log_error("Container %s failed due to unknown reason.", arg_machine);
3155                         r = EXIT_FAILURE;
3156                         break;
3157                 }
3158         }
3159
3160 finish:
3161         loop_remove(loop_nr, &image_fd);
3162
3163         if (pid > 0)
3164                 kill(pid, SIGKILL);
3165
3166         free(arg_directory);
3167         free(arg_machine);
3168         free(arg_user);
3169         strv_free(arg_setenv);
3170         strv_free(arg_network_interfaces);
3171         strv_free(arg_network_macvlan);
3172         strv_free(arg_bind);
3173         strv_free(arg_bind_ro);
3174
3175         return r;
3176 }