chiark / gitweb /
fix spelling of privilege
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90
91 #ifdef HAVE_SECCOMP
92 #include "seccomp-util.h"
93 #endif
94
/* Modes accepted by --link-journal= (see parse_argv()). */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* --link-journal=no */
        LINK_AUTO  = 1,   /* --link-journal=auto; initial value of arg_link_journal */
        LINK_HOST  = 2,   /* --link-journal=host */
        LINK_GUEST = 3    /* --link-journal=guest, also selected by -j */
} LinkJournal;
101
102 static char *arg_directory = NULL;
103 static char *arg_user = NULL;
104 static sd_id128_t arg_uuid = {};
105 static char *arg_machine = NULL;
106 static const char *arg_selinux_context = NULL;
107 static const char *arg_selinux_apifs_context = NULL;
108 static const char *arg_slice = NULL;
109 static bool arg_private_network = false;
110 static bool arg_read_only = false;
111 static bool arg_boot = false;
112 static LinkJournal arg_link_journal = LINK_AUTO;
113 static uint64_t arg_retain =
114         (1ULL << CAP_CHOWN) |
115         (1ULL << CAP_DAC_OVERRIDE) |
116         (1ULL << CAP_DAC_READ_SEARCH) |
117         (1ULL << CAP_FOWNER) |
118         (1ULL << CAP_FSETID) |
119         (1ULL << CAP_IPC_OWNER) |
120         (1ULL << CAP_KILL) |
121         (1ULL << CAP_LEASE) |
122         (1ULL << CAP_LINUX_IMMUTABLE) |
123         (1ULL << CAP_NET_BIND_SERVICE) |
124         (1ULL << CAP_NET_BROADCAST) |
125         (1ULL << CAP_NET_RAW) |
126         (1ULL << CAP_SETGID) |
127         (1ULL << CAP_SETFCAP) |
128         (1ULL << CAP_SETPCAP) |
129         (1ULL << CAP_SETUID) |
130         (1ULL << CAP_SYS_ADMIN) |
131         (1ULL << CAP_SYS_CHROOT) |
132         (1ULL << CAP_SYS_NICE) |
133         (1ULL << CAP_SYS_PTRACE) |
134         (1ULL << CAP_SYS_TTY_CONFIG) |
135         (1ULL << CAP_SYS_RESOURCE) |
136         (1ULL << CAP_SYS_BOOT) |
137         (1ULL << CAP_AUDIT_WRITE) |
138         (1ULL << CAP_AUDIT_CONTROL) |
139         (1ULL << CAP_MKNOD);
140 static char **arg_bind = NULL;
141 static char **arg_bind_ro = NULL;
142 static char **arg_setenv = NULL;
143 static bool arg_quiet = false;
144 static bool arg_share_system = false;
145 static bool arg_register = true;
146 static bool arg_keep_unit = false;
147 static char **arg_network_interfaces = NULL;
148 static char **arg_network_macvlan = NULL;
149 static bool arg_network_veth = false;
150 static const char *arg_network_bridge = NULL;
151 static unsigned long arg_personality = 0xffffffffLU;
152 static const char *arg_image = NULL;
153
154 static int help(void) {
155
156         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
157                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
158                "  -h --help                 Show this help\n"
159                "     --version              Print version string\n"
160                "  -q --quiet                Do not show status information\n"
161                "  -D --directory=PATH       Root directory for the container\n"
162                "  -i --image=PATH           File system device or image for the container\n"
163                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
164                "  -u --user=USER            Run the command under specified user or uid\n"
165                "  -M --machine=NAME         Set the machine name for the container\n"
166                "     --uuid=UUID            Set a specific machine UUID for the container\n"
167                "  -S --slice=SLICE          Place the container in the specified slice\n"
168                "     --private-network      Disable network in container\n"
169                "     --network-interface=INTERFACE\n"
170                "                            Assign an existing network interface to the\n"
171                "                            container\n"
172                "     --network-macvlan=INTERFACE\n"
173                "                            Create a macvlan network interface based on an\n"
174                "                            existing network interface to the container\n"
175                "     --network-veth         Add a virtual ethernet connection between host\n"
176                "                            and container\n"
177                "     --network-bridge=INTERFACE\n"
178                "                            Add a virtual ethernet connection between host\n"
179                "                            and container and add it to an existing bridge on\n"
180                "                            the host\n"
181                "  -Z --selinux-context=SECLABEL\n"
182                "                            Set the SELinux security context to be used by\n"
183                "                            processes in the container\n"
184                "  -L --selinux-apifs-context=SECLABEL\n"
185                "                            Set the SELinux security context to be used by\n"
186                "                            API/tmpfs file systems in the container\n"
187                "     --capability=CAP       In addition to the default, retain specified\n"
188                "                            capability\n"
189                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
190                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
191                "  -j                        Equivalent to --link-journal=host\n"
192                "     --read-only            Mount the root directory read-only\n"
193                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
194                "                            the container\n"
195                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
196                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
197                "     --share-system         Share system namespaces with host\n"
198                "     --register=BOOLEAN     Register container as machine\n"
199                "     --keep-unit            Do not register a scope for the machine, reuse\n"
200                "                            the service unit nspawn is running in\n",
201                program_invocation_short_name);
202
203         return 0;
204 }
205
206 static int parse_argv(int argc, char *argv[]) {
207
208         enum {
209                 ARG_VERSION = 0x100,
210                 ARG_PRIVATE_NETWORK,
211                 ARG_UUID,
212                 ARG_READ_ONLY,
213                 ARG_CAPABILITY,
214                 ARG_DROP_CAPABILITY,
215                 ARG_LINK_JOURNAL,
216                 ARG_BIND,
217                 ARG_BIND_RO,
218                 ARG_SETENV,
219                 ARG_SHARE_SYSTEM,
220                 ARG_REGISTER,
221                 ARG_KEEP_UNIT,
222                 ARG_NETWORK_INTERFACE,
223                 ARG_NETWORK_MACVLAN,
224                 ARG_NETWORK_VETH,
225                 ARG_NETWORK_BRIDGE,
226                 ARG_PERSONALITY,
227         };
228
229         static const struct option options[] = {
230                 { "help",                  no_argument,       NULL, 'h'                   },
231                 { "version",               no_argument,       NULL, ARG_VERSION           },
232                 { "directory",             required_argument, NULL, 'D'                   },
233                 { "user",                  required_argument, NULL, 'u'                   },
234                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
235                 { "boot",                  no_argument,       NULL, 'b'                   },
236                 { "uuid",                  required_argument, NULL, ARG_UUID              },
237                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
238                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
239                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
240                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
241                 { "bind",                  required_argument, NULL, ARG_BIND              },
242                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
243                 { "machine",               required_argument, NULL, 'M'                   },
244                 { "slice",                 required_argument, NULL, 'S'                   },
245                 { "setenv",                required_argument, NULL, ARG_SETENV            },
246                 { "selinux-context",       required_argument, NULL, 'Z'                   },
247                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
248                 { "quiet",                 no_argument,       NULL, 'q'                   },
249                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
250                 { "register",              required_argument, NULL, ARG_REGISTER          },
251                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
252                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
253                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
254                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
255                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
256                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
257                 { "image",                 required_argument, NULL, 'i'                   },
258                 {}
259         };
260
261         int c, r;
262         uint64_t plus = 0, minus = 0;
263
264         assert(argc >= 0);
265         assert(argv);
266
267         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
268
269                 switch (c) {
270
271                 case 'h':
272                         return help();
273
274                 case ARG_VERSION:
275                         puts(PACKAGE_STRING);
276                         puts(SYSTEMD_FEATURES);
277                         return 0;
278
279                 case 'D':
280                         free(arg_directory);
281                         arg_directory = canonicalize_file_name(optarg);
282                         if (!arg_directory) {
283                                 log_error("Invalid root directory: %m");
284                                 return -ENOMEM;
285                         }
286
287                         break;
288
289                 case 'i':
290                         arg_image = optarg;
291                         break;
292
293                 case 'u':
294                         free(arg_user);
295                         arg_user = strdup(optarg);
296                         if (!arg_user)
297                                 return log_oom();
298
299                         break;
300
301                 case ARG_NETWORK_BRIDGE:
302                         arg_network_bridge = optarg;
303
304                         /* fall through */
305
306                 case ARG_NETWORK_VETH:
307                         arg_network_veth = true;
308                         arg_private_network = true;
309                         break;
310
311                 case ARG_NETWORK_INTERFACE:
312                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
313                                 return log_oom();
314
315                         arg_private_network = true;
316                         break;
317
318                 case ARG_NETWORK_MACVLAN:
319                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
320                                 return log_oom();
321
322                         /* fall through */
323
324                 case ARG_PRIVATE_NETWORK:
325                         arg_private_network = true;
326                         break;
327
328                 case 'b':
329                         arg_boot = true;
330                         break;
331
332                 case ARG_UUID:
333                         r = sd_id128_from_string(optarg, &arg_uuid);
334                         if (r < 0) {
335                                 log_error("Invalid UUID: %s", optarg);
336                                 return r;
337                         }
338                         break;
339
340                 case 'S':
341                         arg_slice = optarg;
342                         break;
343
344                 case 'M':
345                         if (isempty(optarg)) {
346                                 free(arg_machine);
347                                 arg_machine = NULL;
348                         } else {
349
350                                 if (!hostname_is_valid(optarg)) {
351                                         log_error("Invalid machine name: %s", optarg);
352                                         return -EINVAL;
353                                 }
354
355                                 free(arg_machine);
356                                 arg_machine = strdup(optarg);
357                                 if (!arg_machine)
358                                         return log_oom();
359
360                                 break;
361                         }
362
363                 case 'Z':
364                         arg_selinux_context = optarg;
365                         break;
366
367                 case 'L':
368                         arg_selinux_apifs_context = optarg;
369                         break;
370
371                 case ARG_READ_ONLY:
372                         arg_read_only = true;
373                         break;
374
375                 case ARG_CAPABILITY:
376                 case ARG_DROP_CAPABILITY: {
377                         char *state, *word;
378                         size_t length;
379
380                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
381                                 _cleanup_free_ char *t;
382                                 cap_value_t cap;
383
384                                 t = strndup(word, length);
385                                 if (!t)
386                                         return log_oom();
387
388                                 if (streq(t, "all")) {
389                                         if (c == ARG_CAPABILITY)
390                                                 plus = (uint64_t) -1;
391                                         else
392                                                 minus = (uint64_t) -1;
393                                 } else {
394                                         if (cap_from_name(t, &cap) < 0) {
395                                                 log_error("Failed to parse capability %s.", t);
396                                                 return -EINVAL;
397                                         }
398
399                                         if (c == ARG_CAPABILITY)
400                                                 plus |= 1ULL << (uint64_t) cap;
401                                         else
402                                                 minus |= 1ULL << (uint64_t) cap;
403                                 }
404                         }
405
406                         break;
407                 }
408
409                 case 'j':
410                         arg_link_journal = LINK_GUEST;
411                         break;
412
413                 case ARG_LINK_JOURNAL:
414                         if (streq(optarg, "auto"))
415                                 arg_link_journal = LINK_AUTO;
416                         else if (streq(optarg, "no"))
417                                 arg_link_journal = LINK_NO;
418                         else if (streq(optarg, "guest"))
419                                 arg_link_journal = LINK_GUEST;
420                         else if (streq(optarg, "host"))
421                                 arg_link_journal = LINK_HOST;
422                         else {
423                                 log_error("Failed to parse link journal mode %s", optarg);
424                                 return -EINVAL;
425                         }
426
427                         break;
428
429                 case ARG_BIND:
430                 case ARG_BIND_RO: {
431                         _cleanup_free_ char *a = NULL, *b = NULL;
432                         char *e;
433                         char ***x;
434
435                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
436
437                         e = strchr(optarg, ':');
438                         if (e) {
439                                 a = strndup(optarg, e - optarg);
440                                 b = strdup(e + 1);
441                         } else {
442                                 a = strdup(optarg);
443                                 b = strdup(optarg);
444                         }
445
446                         if (!a || !b)
447                                 return log_oom();
448
449                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
450                                 log_error("Invalid bind mount specification: %s", optarg);
451                                 return -EINVAL;
452                         }
453
454                         r = strv_extend(x, a);
455                         if (r < 0)
456                                 return log_oom();
457
458                         r = strv_extend(x, b);
459                         if (r < 0)
460                                 return log_oom();
461
462                         break;
463                 }
464
465                 case ARG_SETENV: {
466                         char **n;
467
468                         if (!env_assignment_is_valid(optarg)) {
469                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
470                                 return -EINVAL;
471                         }
472
473                         n = strv_env_set(arg_setenv, optarg);
474                         if (!n)
475                                 return log_oom();
476
477                         strv_free(arg_setenv);
478                         arg_setenv = n;
479                         break;
480                 }
481
482                 case 'q':
483                         arg_quiet = true;
484                         break;
485
486                 case ARG_SHARE_SYSTEM:
487                         arg_share_system = true;
488                         break;
489
490                 case ARG_REGISTER:
491                         r = parse_boolean(optarg);
492                         if (r < 0) {
493                                 log_error("Failed to parse --register= argument: %s", optarg);
494                                 return r;
495                         }
496
497                         arg_register = r;
498                         break;
499
500                 case ARG_KEEP_UNIT:
501                         arg_keep_unit = true;
502                         break;
503
504                 case ARG_PERSONALITY:
505
506                         arg_personality = personality_from_string(optarg);
507                         if (arg_personality == 0xffffffffLU) {
508                                 log_error("Unknown or unsupported personality '%s'.", optarg);
509                                 return -EINVAL;
510                         }
511
512                         break;
513
514                 case '?':
515                         return -EINVAL;
516
517                 default:
518                         assert_not_reached("Unhandled option");
519                 }
520         }
521
522         if (arg_share_system)
523                 arg_register = false;
524
525         if (arg_boot && arg_share_system) {
526                 log_error("--boot and --share-system may not be combined.");
527                 return -EINVAL;
528         }
529
530         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
531                 log_error("--keep-unit may not be used when invoked from a user session.");
532                 return -EINVAL;
533         }
534
535         if (arg_directory && arg_image) {
536                 log_error("--directory= and --image= may not be combined.");
537                 return -EINVAL;
538         }
539
540         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
541
542         return 1;
543 }
544
545 static int mount_all(const char *dest) {
546
547         typedef struct MountPoint {
548                 const char *what;
549                 const char *where;
550                 const char *type;
551                 const char *options;
552                 unsigned long flags;
553                 bool fatal;
554         } MountPoint;
555
556         static const MountPoint mount_table[] = {
557                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
558                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
559                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
560                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
561                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
562                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
563                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
564                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
565 #ifdef HAVE_SELINUX
566                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
567                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
568 #endif
569         };
570
571         unsigned k;
572         int r = 0;
573
574         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
575                 _cleanup_free_ char *where = NULL;
576 #ifdef HAVE_SELINUX
577                 _cleanup_free_ char *options = NULL;
578 #endif
579                 const char *o;
580                 int t;
581
582                 where = strjoin(dest, "/", mount_table[k].where, NULL);
583                 if (!where)
584                         return log_oom();
585
586                 t = path_is_mount_point(where, true);
587                 if (t < 0) {
588                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
589
590                         if (r == 0)
591                                 r = t;
592
593                         continue;
594                 }
595
596                 /* Skip this entry if it is not a remount. */
597                 if (mount_table[k].what && t > 0)
598                         continue;
599
600                 mkdir_p(where, 0755);
601
602 #ifdef HAVE_SELINUX
603                 if (arg_selinux_apifs_context &&
604                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
605                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
606                         if (!options)
607                                 return log_oom();
608
609                         o = options;
610                 } else
611 #endif
612                         o = mount_table[k].options;
613
614
615                 if (mount(mount_table[k].what,
616                           where,
617                           mount_table[k].type,
618                           mount_table[k].flags,
619                           o) < 0 &&
620                     mount_table[k].fatal) {
621
622                         log_error("mount(%s) failed: %m", where);
623
624                         if (r == 0)
625                                 r = -errno;
626                 }
627         }
628
629         return r;
630 }
631
632 static int mount_binds(const char *dest, char **l, unsigned long flags) {
633         char **x, **y;
634
635         STRV_FOREACH_PAIR(x, y, l) {
636                 char *where;
637                 struct stat source_st, dest_st;
638                 int r;
639
640                 if (stat(*x, &source_st) < 0) {
641                         log_error("Failed to stat %s: %m", *x);
642                         return -errno;
643                 }
644
645                 where = strappenda(dest, *y);
646                 r = stat(where, &dest_st);
647                 if (r == 0) {
648                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
649                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
650                                                 *x, where);
651                                 return -EINVAL;
652                         }
653                 } else if (errno == ENOENT) {
654                         r = mkdir_parents_label(where, 0755);
655                         if (r < 0) {
656                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
657                                 return r;
658                         }
659                 } else {
660                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
661                         return -errno;
662                 }
663                 /* Create the mount point, but be conservative -- refuse to create block
664                 * and char devices. */
665                 if (S_ISDIR(source_st.st_mode))
666                         mkdir_label(where, 0755);
667                 else if (S_ISFIFO(source_st.st_mode))
668                         mkfifo(where, 0644);
669                 else if (S_ISSOCK(source_st.st_mode))
670                         mknod(where, 0644 | S_IFSOCK, 0);
671                 else if (S_ISREG(source_st.st_mode))
672                         touch(where);
673                 else {
674                         log_error("Refusing to create mountpoint for file: %s", *x);
675                         return -ENOTSUP;
676                 }
677
678                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
679                         log_error("mount(%s) failed: %m", where);
680                         return -errno;
681                 }
682
683                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
684                         log_error("mount(%s) failed: %m", where);
685                         return -errno;
686                 }
687         }
688
689         return 0;
690 }
691
692 static int setup_timezone(const char *dest) {
693         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
694         char *z, *y;
695         int r;
696
697         assert(dest);
698
699         /* Fix the timezone, if possible */
700         r = readlink_malloc("/etc/localtime", &p);
701         if (r < 0) {
702                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
703                 return 0;
704         }
705
706         z = path_startswith(p, "../usr/share/zoneinfo/");
707         if (!z)
708                 z = path_startswith(p, "/usr/share/zoneinfo/");
709         if (!z) {
710                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
711                 return 0;
712         }
713
714         where = strappend(dest, "/etc/localtime");
715         if (!where)
716                 return log_oom();
717
718         r = readlink_malloc(where, &q);
719         if (r >= 0) {
720                 y = path_startswith(q, "../usr/share/zoneinfo/");
721                 if (!y)
722                         y = path_startswith(q, "/usr/share/zoneinfo/");
723
724
725                 /* Already pointing to the right place? Then do nothing .. */
726                 if (y && streq(y, z))
727                         return 0;
728         }
729
730         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
731         if (!check)
732                 return log_oom();
733
734         if (access(check, F_OK) < 0) {
735                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
736                 return 0;
737         }
738
739         what = strappend("../usr/share/zoneinfo/", z);
740         if (!what)
741                 return log_oom();
742
743         unlink(where);
744         if (symlink(what, where) < 0) {
745                 log_error("Failed to correct timezone of container: %m");
746                 return 0;
747         }
748
749         return 0;
750 }
751
752 static int setup_resolv_conf(const char *dest) {
753         char _cleanup_free_ *where = NULL;
754
755         assert(dest);
756
757         if (arg_private_network)
758                 return 0;
759
760         /* Fix resolv.conf, if possible */
761         where = strappend(dest, "/etc/resolv.conf");
762         if (!where)
763                 return log_oom();
764
765         /* We don't really care for the results of this really. If it
766          * fails, it fails, but meh... */
767         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
768
769         return 0;
770 }
771
772 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
773
774         snprintf(s, 37,
775                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
776                  SD_ID128_FORMAT_VAL(id));
777
778         return s;
779 }
780
781 static int setup_boot_id(const char *dest) {
782         _cleanup_free_ char *from = NULL, *to = NULL;
783         sd_id128_t rnd = {};
784         char as_uuid[37];
785         int r;
786
787         assert(dest);
788
789         if (arg_share_system)
790                 return 0;
791
792         /* Generate a new randomized boot ID, so that each boot-up of
793          * the container gets a new one */
794
795         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
796         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
797         if (!from || !to)
798                 return log_oom();
799
800         r = sd_id128_randomize(&rnd);
801         if (r < 0) {
802                 log_error("Failed to generate random boot id: %s", strerror(-r));
803                 return r;
804         }
805
806         id128_format_as_uuid(rnd, as_uuid);
807
808         r = write_string_file(from, as_uuid);
809         if (r < 0) {
810                 log_error("Failed to write boot id: %s", strerror(-r));
811                 return r;
812         }
813
814         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
815                 log_error("Failed to bind mount boot id: %m");
816                 r = -errno;
817         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
818                 log_warning("Failed to make boot id read-only: %m");
819
820         unlink(from);
821         return r;
822 }
823
824 static int copy_devnodes(const char *dest) {
825
826         static const char devnodes[] =
827                 "null\0"
828                 "zero\0"
829                 "full\0"
830                 "random\0"
831                 "urandom\0"
832                 "tty\0";
833
834         const char *d;
835         int r = 0;
836         _cleanup_umask_ mode_t u;
837
838         assert(dest);
839
840         u = umask(0000);
841
842         NULSTR_FOREACH(d, devnodes) {
843                 _cleanup_free_ char *from = NULL, *to = NULL;
844                 struct stat st;
845
846                 from = strappend("/dev/", d);
847                 to = strjoin(dest, "/dev/", d, NULL);
848                 if (!from || !to)
849                         return log_oom();
850
851                 if (stat(from, &st) < 0) {
852
853                         if (errno != ENOENT) {
854                                 log_error("Failed to stat %s: %m", from);
855                                 return -errno;
856                         }
857
858                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
859
860                         log_error("%s is not a char or block device, cannot copy", from);
861                         return -EIO;
862
863                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
864
865                         log_error("mknod(%s) failed: %m", dest);
866                         return  -errno;
867                 }
868         }
869
870         return r;
871 }
872
873 static int setup_ptmx(const char *dest) {
874         _cleanup_free_ char *p = NULL;
875
876         p = strappend(dest, "/dev/ptmx");
877         if (!p)
878                 return log_oom();
879
880         if (symlink("pts/ptmx", p) < 0) {
881                 log_error("Failed to create /dev/ptmx symlink: %m");
882                 return -errno;
883         }
884
885         return 0;
886 }
887
888 static int setup_dev_console(const char *dest, const char *console) {
889         _cleanup_umask_ mode_t u;
890         const char *to;
891         struct stat st;
892         int r;
893
894         assert(dest);
895         assert(console);
896
897         u = umask(0000);
898
899         if (stat("/dev/null", &st) < 0) {
900                 log_error("Failed to stat /dev/null: %m");
901                 return -errno;
902         }
903
904         r = chmod_and_chown(console, 0600, 0, 0);
905         if (r < 0) {
906                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
907                 return r;
908         }
909
910         /* We need to bind mount the right tty to /dev/console since
911          * ptys can only exist on pts file systems. To have something
912          * to bind mount things on we create a device node first, and
913          * use /dev/null for that since we the cgroups device policy
914          * allows us to create that freely, while we cannot create
915          * /dev/console. (Note that the major minor doesn't actually
916          * matter here, since we mount it over anyway). */
917
918         to = strappenda(dest, "/dev/console");
919         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
920                 log_error("mknod() for /dev/console failed: %m");
921                 return -errno;
922         }
923
924         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
925                 log_error("Bind mount for /dev/console failed: %m");
926                 return -errno;
927         }
928
929         return 0;
930 }
931
932 static int setup_kmsg(const char *dest, int kmsg_socket) {
933         _cleanup_free_ char *from = NULL, *to = NULL;
934         int r, fd, k;
935         _cleanup_umask_ mode_t u;
936         union {
937                 struct cmsghdr cmsghdr;
938                 uint8_t buf[CMSG_SPACE(sizeof(int))];
939         } control = {};
940         struct msghdr mh = {
941                 .msg_control = &control,
942                 .msg_controllen = sizeof(control),
943         };
944         struct cmsghdr *cmsg;
945
946         assert(dest);
947         assert(kmsg_socket >= 0);
948
949         u = umask(0000);
950
951         /* We create the kmsg FIFO as /dev/kmsg, but immediately
952          * delete it after bind mounting it to /proc/kmsg. While FIFOs
953          * on the reading side behave very similar to /proc/kmsg,
954          * their writing side behaves differently from /dev/kmsg in
955          * that writing blocks when nothing is reading. In order to
956          * avoid any problems with containers deadlocking due to this
957          * we simply make /dev/kmsg unavailable to the container. */
958         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
959             asprintf(&to, "%s/proc/kmsg", dest) < 0)
960                 return log_oom();
961
962         if (mkfifo(from, 0600) < 0) {
963                 log_error("mkfifo() for /dev/kmsg failed: %m");
964                 return -errno;
965         }
966
967         r = chmod_and_chown(from, 0600, 0, 0);
968         if (r < 0) {
969                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
970                 return r;
971         }
972
973         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
974                 log_error("Bind mount for /proc/kmsg failed: %m");
975                 return -errno;
976         }
977
978         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
979         if (fd < 0) {
980                 log_error("Failed to open fifo: %m");
981                 return -errno;
982         }
983
984         cmsg = CMSG_FIRSTHDR(&mh);
985         cmsg->cmsg_level = SOL_SOCKET;
986         cmsg->cmsg_type = SCM_RIGHTS;
987         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
988         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
989
990         mh.msg_controllen = cmsg->cmsg_len;
991
992         /* Store away the fd in the socket, so that it stays open as
993          * long as we run the child */
994         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
995         safe_close(fd);
996
997         if (k < 0) {
998                 log_error("Failed to send FIFO fd: %m");
999                 return -errno;
1000         }
1001
1002         /* And now make the FIFO unavailable as /dev/kmsg... */
1003         unlink(from);
1004         return 0;
1005 }
1006
1007 static int setup_hostname(void) {
1008
1009         if (arg_share_system)
1010                 return 0;
1011
1012         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1013                 return -errno;
1014
1015         return 0;
1016 }
1017
1018 static int setup_journal(const char *directory) {
1019         sd_id128_t machine_id, this_id;
1020         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1021         char *id;
1022         int r;
1023
1024         p = strappend(directory, "/etc/machine-id");
1025         if (!p)
1026                 return log_oom();
1027
1028         r = read_one_line_file(p, &b);
1029         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1030                 return 0;
1031         else if (r < 0) {
1032                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1033                 return r;
1034         }
1035
1036         id = strstrip(b);
1037         if (isempty(id) && arg_link_journal == LINK_AUTO)
1038                 return 0;
1039
1040         /* Verify validity */
1041         r = sd_id128_from_string(id, &machine_id);
1042         if (r < 0) {
1043                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1044                 return r;
1045         }
1046
1047         r = sd_id128_get_machine(&this_id);
1048         if (r < 0) {
1049                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1050                 return r;
1051         }
1052
1053         if (sd_id128_equal(machine_id, this_id)) {
1054                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1055                          "Host and machine ids are equal (%s): refusing to link journals", id);
1056                 if (arg_link_journal == LINK_AUTO)
1057                         return 0;
1058                 return
1059                         -EEXIST;
1060         }
1061
1062         if (arg_link_journal == LINK_NO)
1063                 return 0;
1064
1065         free(p);
1066         p = strappend("/var/log/journal/", id);
1067         q = strjoin(directory, "/var/log/journal/", id, NULL);
1068         if (!p || !q)
1069                 return log_oom();
1070
1071         if (path_is_mount_point(p, false) > 0) {
1072                 if (arg_link_journal != LINK_AUTO) {
1073                         log_error("%s: already a mount point, refusing to use for journal", p);
1074                         return -EEXIST;
1075                 }
1076
1077                 return 0;
1078         }
1079
1080         if (path_is_mount_point(q, false) > 0) {
1081                 if (arg_link_journal != LINK_AUTO) {
1082                         log_error("%s: already a mount point, refusing to use for journal", q);
1083                         return -EEXIST;
1084                 }
1085
1086                 return 0;
1087         }
1088
1089         r = readlink_and_make_absolute(p, &d);
1090         if (r >= 0) {
1091                 if ((arg_link_journal == LINK_GUEST ||
1092                      arg_link_journal == LINK_AUTO) &&
1093                     path_equal(d, q)) {
1094
1095                         r = mkdir_p(q, 0755);
1096                         if (r < 0)
1097                                 log_warning("failed to create directory %s: %m", q);
1098                         return 0;
1099                 }
1100
1101                 if (unlink(p) < 0) {
1102                         log_error("Failed to remove symlink %s: %m", p);
1103                         return -errno;
1104                 }
1105         } else if (r == -EINVAL) {
1106
1107                 if (arg_link_journal == LINK_GUEST &&
1108                     rmdir(p) < 0) {
1109
1110                         if (errno == ENOTDIR) {
1111                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1112                                 return r;
1113                         } else {
1114                                 log_error("Failed to remove %s: %m", p);
1115                                 return -errno;
1116                         }
1117                 }
1118         } else if (r != -ENOENT) {
1119                 log_error("readlink(%s) failed: %m", p);
1120                 return r;
1121         }
1122
1123         if (arg_link_journal == LINK_GUEST) {
1124
1125                 if (symlink(q, p) < 0) {
1126                         log_error("Failed to symlink %s to %s: %m", q, p);
1127                         return -errno;
1128                 }
1129
1130                 r = mkdir_p(q, 0755);
1131                 if (r < 0)
1132                         log_warning("failed to create directory %s: %m", q);
1133                 return 0;
1134         }
1135
1136         if (arg_link_journal == LINK_HOST) {
1137                 r = mkdir_p(p, 0755);
1138                 if (r < 0) {
1139                         log_error("Failed to create %s: %m", p);
1140                         return r;
1141                 }
1142
1143         } else if (access(p, F_OK) < 0)
1144                 return 0;
1145
1146         if (dir_is_empty(q) == 0) {
1147                 log_error("%s not empty.", q);
1148                 return -ENOTEMPTY;
1149         }
1150
1151         r = mkdir_p(q, 0755);
1152         if (r < 0) {
1153                 log_error("Failed to create %s: %m", q);
1154                 return r;
1155         }
1156
1157         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1158                 log_error("Failed to bind mount journal from host into guest: %m");
1159                 return -errno;
1160         }
1161
1162         return 0;
1163 }
1164
1165 static int setup_kdbus(const char *dest, const char *path) {
1166         const char *p;
1167
1168         if (!path)
1169                 return 0;
1170
1171         p = strappenda(dest, "/dev/kdbus");
1172         if (mkdir(p, 0755) < 0) {
1173                 log_error("Failed to create kdbus path: %m");
1174                 return  -errno;
1175         }
1176
1177         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1178                 log_error("Failed to mount kdbus domain path: %m");
1179                 return -errno;
1180         }
1181
1182         return 0;
1183 }
1184
1185 static int drop_capabilities(void) {
1186         return capability_bounding_set_drop(~arg_retain, false);
1187 }
1188
1189 static int register_machine(pid_t pid) {
1190         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1191         _cleanup_bus_unref_ sd_bus *bus = NULL;
1192         int r;
1193
1194         if (!arg_register)
1195                 return 0;
1196
1197         r = sd_bus_default_system(&bus);
1198         if (r < 0) {
1199                 log_error("Failed to open system bus: %s", strerror(-r));
1200                 return r;
1201         }
1202
1203         if (arg_keep_unit) {
1204                 r = sd_bus_call_method(
1205                                 bus,
1206                                 "org.freedesktop.machine1",
1207                                 "/org/freedesktop/machine1",
1208                                 "org.freedesktop.machine1.Manager",
1209                                 "RegisterMachine",
1210                                 &error,
1211                                 NULL,
1212                                 "sayssus",
1213                                 arg_machine,
1214                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1215                                 "nspawn",
1216                                 "container",
1217                                 (uint32_t) pid,
1218                                 strempty(arg_directory));
1219         } else {
1220                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1221
1222                 r = sd_bus_message_new_method_call(
1223                                 bus,
1224                                 &m,
1225                                 "org.freedesktop.machine1",
1226                                 "/org/freedesktop/machine1",
1227                                 "org.freedesktop.machine1.Manager",
1228                                 "CreateMachine");
1229                 if (r < 0) {
1230                         log_error("Failed to create message: %s", strerror(-r));
1231                         return r;
1232                 }
1233
1234                 r = sd_bus_message_append(
1235                                 m,
1236                                 "sayssus",
1237                                 arg_machine,
1238                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1239                                 "nspawn",
1240                                 "container",
1241                                 (uint32_t) pid,
1242                                 strempty(arg_directory));
1243                 if (r < 0) {
1244                         log_error("Failed to append message arguments: %s", strerror(-r));
1245                         return r;
1246                 }
1247
1248                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1249                 if (r < 0) {
1250                         log_error("Failed to open container: %s", strerror(-r));
1251                         return r;
1252                 }
1253
1254                 if (!isempty(arg_slice)) {
1255                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1256                         if (r < 0) {
1257                                 log_error("Failed to append slice: %s", strerror(-r));
1258                                 return r;
1259                         }
1260                 }
1261
1262                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1263                 if (r < 0) {
1264                         log_error("Failed to add device policy: %s", strerror(-r));
1265                         return r;
1266                 }
1267
1268                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1269                                           /* Allow the container to
1270                                            * access and create the API
1271                                            * device nodes, so that
1272                                            * PrivateDevices= in the
1273                                            * container can work
1274                                            * fine */
1275                                           "/dev/null", "rwm",
1276                                           "/dev/zero", "rwm",
1277                                           "/dev/full", "rwm",
1278                                           "/dev/random", "rwm",
1279                                           "/dev/urandom", "rwm",
1280                                           "/dev/tty", "rwm",
1281                                           /* Allow the container
1282                                            * access to ptys. However,
1283                                            * do not permit the
1284                                            * container to ever create
1285                                            * these device nodes. */
1286                                           "/dev/pts/ptmx", "rw",
1287                                           "char-pts", "rw",
1288                                           /* Allow the container
1289                                            * access to all kdbus
1290                                            * devices. Again, the
1291                                            * container cannot create
1292                                            * these nodes, only use
1293                                            * them. We use a pretty
1294                                            * open match here, so that
1295                                            * the kernel API can still
1296                                            * change. */
1297                                           "char-kdbus", "rw",
1298                                           "char-kdbus/*", "rw");
1299                 if (r < 0) {
1300                         log_error("Failed to add device whitelist: %s", strerror(-r));
1301                         return r;
1302                 }
1303
1304                 r = sd_bus_message_close_container(m);
1305                 if (r < 0) {
1306                         log_error("Failed to close container: %s", strerror(-r));
1307                         return r;
1308                 }
1309
1310                 r = sd_bus_call(bus, m, 0, &error, NULL);
1311         }
1312
1313         if (r < 0) {
1314                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1315                 return r;
1316         }
1317
1318         return 0;
1319 }
1320
1321 static int terminate_machine(pid_t pid) {
1322         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1323         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1324         _cleanup_bus_unref_ sd_bus *bus = NULL;
1325         const char *path;
1326         int r;
1327
1328         if (!arg_register)
1329                 return 0;
1330
1331         r = sd_bus_default_system(&bus);
1332         if (r < 0) {
1333                 log_error("Failed to open system bus: %s", strerror(-r));
1334                 return r;
1335         }
1336
1337         r = sd_bus_call_method(
1338                         bus,
1339                         "org.freedesktop.machine1",
1340                         "/org/freedesktop/machine1",
1341                         "org.freedesktop.machine1.Manager",
1342                         "GetMachineByPID",
1343                         &error,
1344                         &reply,
1345                         "u",
1346                         (uint32_t) pid);
1347         if (r < 0) {
1348                 /* Note that the machine might already have been
1349                  * cleaned up automatically, hence don't consider it a
1350                  * failure if we cannot get the machine object. */
1351                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1352                 return 0;
1353         }
1354
1355         r = sd_bus_message_read(reply, "o", &path);
1356         if (r < 0)
1357                 return bus_log_parse_error(r);
1358
1359         r = sd_bus_call_method(
1360                         bus,
1361                         "org.freedesktop.machine1",
1362                         path,
1363                         "org.freedesktop.machine1.Machine",
1364                         "Terminate",
1365                         &error,
1366                         NULL,
1367                         NULL);
1368         if (r < 0) {
1369                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1370                 return 0;
1371         }
1372
1373         return 0;
1374 }
1375
1376 static int reset_audit_loginuid(void) {
1377         _cleanup_free_ char *p = NULL;
1378         int r;
1379
1380         if (arg_share_system)
1381                 return 0;
1382
1383         r = read_one_line_file("/proc/self/loginuid", &p);
1384         if (r == -ENOENT)
1385                 return 0;
1386         if (r < 0) {
1387                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1388                 return r;
1389         }
1390
1391         /* Already reset? */
1392         if (streq(p, "4294967295"))
1393                 return 0;
1394
1395         r = write_string_file("/proc/self/loginuid", "4294967295");
1396         if (r < 0) {
1397                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1398                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1399                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1400                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1401                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1402
1403                 sleep(5);
1404         }
1405
1406         return 0;
1407 }
1408
1409 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1410
1411 static int get_mac(struct ether_addr *mac) {
1412         int r;
1413
1414         uint8_t result[8];
1415         size_t l, sz;
1416         uint8_t *v;
1417
1418         l = strlen(arg_machine);
1419         sz = sizeof(sd_id128_t) + l;
1420         v = alloca(sz);
1421
1422         /* fetch some persistent data unique to the host */
1423         r = sd_id128_get_machine((sd_id128_t*) v);
1424         if (r < 0)
1425                 return r;
1426
1427         /* combine with some data unique (on this host) to this
1428          * container instance */
1429         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1430
1431         /* Let's hash the host machine ID plus the container name. We
1432          * use a fixed, but originally randomly created hash key here. */
1433         siphash24(result, v, sz, HASH_KEY.bytes);
1434
1435         assert_cc(ETH_ALEN <= sizeof(result));
1436         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1437
1438         /* see eth_random_addr in the kernel */
1439         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1440         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1441
1442         return 0;
1443 }
1444
1445 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1446         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1447         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1448         struct ether_addr mac;
1449         int r;
1450
1451         if (!arg_private_network)
1452                 return 0;
1453
1454         if (!arg_network_veth)
1455                 return 0;
1456
1457         /* Use two different interface name prefixes depending whether
1458          * we are in bridge mode or not. */
1459         if (arg_network_bridge)
1460                 memcpy(iface_name, "vb-", 3);
1461         else
1462                 memcpy(iface_name, "ve-", 3);
1463         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1464
1465         r = get_mac(&mac);
1466         if (r < 0) {
1467                 log_error("Failed to generate predictable MAC address for host0");
1468                 return r;
1469         }
1470
1471         r = sd_rtnl_open(&rtnl, 0);
1472         if (r < 0) {
1473                 log_error("Failed to connect to netlink: %s", strerror(-r));
1474                 return r;
1475         }
1476
1477         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1478         if (r < 0) {
1479                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1480                 return r;
1481         }
1482
1483         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1484         if (r < 0) {
1485                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1486                 return r;
1487         }
1488
1489         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1490         if (r < 0) {
1491                 log_error("Failed to open netlink container: %s", strerror(-r));
1492                 return r;
1493         }
1494
1495         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1496         if (r < 0) {
1497                 log_error("Failed to open netlink container: %s", strerror(-r));
1498                 return r;
1499         }
1500
1501         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1502         if (r < 0) {
1503                 log_error("Failed to open netlink container: %s", strerror(-r));
1504                 return r;
1505         }
1506
1507         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1508         if (r < 0) {
1509                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1510                 return r;
1511         }
1512
1513         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1514         if (r < 0) {
1515                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1516                 return r;
1517         }
1518
1519         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1520         if (r < 0) {
1521                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1522                 return r;
1523         }
1524
1525         r = sd_rtnl_message_close_container(m);
1526         if (r < 0) {
1527                 log_error("Failed to close netlink container: %s", strerror(-r));
1528                 return r;
1529         }
1530
1531         r = sd_rtnl_message_close_container(m);
1532         if (r < 0) {
1533                 log_error("Failed to close netlink container: %s", strerror(-r));
1534                 return r;
1535         }
1536
1537         r = sd_rtnl_message_close_container(m);
1538         if (r < 0) {
1539                 log_error("Failed to close netlink container: %s", strerror(-r));
1540                 return r;
1541         }
1542
1543         r = sd_rtnl_call(rtnl, m, 0, NULL);
1544         if (r < 0) {
1545                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1546                 return r;
1547         }
1548
1549         return 0;
1550 }
1551
1552 static int setup_bridge(const char veth_name[]) {
1553         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1554         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1555         int r, bridge;
1556
1557         if (!arg_private_network)
1558                 return 0;
1559
1560         if (!arg_network_veth)
1561                 return 0;
1562
1563         if (!arg_network_bridge)
1564                 return 0;
1565
1566         bridge = (int) if_nametoindex(arg_network_bridge);
1567         if (bridge <= 0) {
1568                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1569                 return -errno;
1570         }
1571
1572         r = sd_rtnl_open(&rtnl, 0);
1573         if (r < 0) {
1574                 log_error("Failed to connect to netlink: %s", strerror(-r));
1575                 return r;
1576         }
1577
1578         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1579         if (r < 0) {
1580                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1581                 return r;
1582         }
1583
1584         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1585         if (r < 0) {
1586                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1587                 return r;
1588         }
1589
1590         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1591         if (r < 0) {
1592                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1593                 return r;
1594         }
1595
1596         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1597         if (r < 0) {
1598                 log_error("Failed to add netlink master field: %s", strerror(-r));
1599                 return r;
1600         }
1601
1602         r = sd_rtnl_call(rtnl, m, 0, NULL);
1603         if (r < 0) {
1604                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1605                 return r;
1606         }
1607
1608         return 0;
1609 }
1610
1611 static int parse_interface(struct udev *udev, const char *name) {
1612         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1613         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1614         int ifi;
1615
1616         ifi = (int) if_nametoindex(name);
1617         if (ifi <= 0) {
1618                 log_error("Failed to resolve interface %s: %m", name);
1619                 return -errno;
1620         }
1621
1622         sprintf(ifi_str, "n%i", ifi);
1623         d = udev_device_new_from_device_id(udev, ifi_str);
1624         if (!d) {
1625                 log_error("Failed to get udev device for interface %s: %m", name);
1626                 return -errno;
1627         }
1628
1629         if (udev_device_get_is_initialized(d) <= 0) {
1630                 log_error("Network interface %s is not initialized yet.", name);
1631                 return -EBUSY;
1632         }
1633
1634         return ifi;
1635 }
1636
1637 static int move_network_interfaces(pid_t pid) {
1638         _cleanup_udev_unref_ struct udev *udev = NULL;
1639         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1640         char **i;
1641         int r;
1642
1643         if (!arg_private_network)
1644                 return 0;
1645
1646         if (strv_isempty(arg_network_interfaces))
1647                 return 0;
1648
1649         r = sd_rtnl_open(&rtnl, 0);
1650         if (r < 0) {
1651                 log_error("Failed to connect to netlink: %s", strerror(-r));
1652                 return r;
1653         }
1654
1655         udev = udev_new();
1656         if (!udev) {
1657                 log_error("Failed to connect to udev.");
1658                 return -ENOMEM;
1659         }
1660
1661         STRV_FOREACH(i, arg_network_interfaces) {
1662                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1663                 int ifi;
1664
1665                 ifi = parse_interface(udev, *i);
1666                 if (ifi < 0)
1667                         return ifi;
1668
1669                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1670                 if (r < 0) {
1671                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1672                         return r;
1673                 }
1674
1675                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1676                 if (r < 0) {
1677                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1678                         return r;
1679                 }
1680
1681                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1682                 if (r < 0) {
1683                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1684                         return r;
1685                 }
1686         }
1687
1688         return 0;
1689 }
1690
1691 static int setup_macvlan(pid_t pid) {
1692         _cleanup_udev_unref_ struct udev *udev = NULL;
1693         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1694         char **i;
1695         int r;
1696
1697         if (!arg_private_network)
1698                 return 0;
1699
1700         if (strv_isempty(arg_network_macvlan))
1701                 return 0;
1702
1703         r = sd_rtnl_open(&rtnl, 0);
1704         if (r < 0) {
1705                 log_error("Failed to connect to netlink: %s", strerror(-r));
1706                 return r;
1707         }
1708
1709         udev = udev_new();
1710         if (!udev) {
1711                 log_error("Failed to connect to udev.");
1712                 return -ENOMEM;
1713         }
1714
1715         STRV_FOREACH(i, arg_network_macvlan) {
1716                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1717                 _cleanup_free_ char *n = NULL;
1718                 int ifi;
1719
1720                 ifi = parse_interface(udev, *i);
1721                 if (ifi < 0)
1722                         return ifi;
1723
1724                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1725                 if (r < 0) {
1726                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1727                         return r;
1728                 }
1729
1730                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1731                 if (r < 0) {
1732                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1733                         return r;
1734                 }
1735
1736                 n = strappend("mv-", *i);
1737                 if (!n)
1738                         return log_oom();
1739
1740                 strshorten(n, IFNAMSIZ-1);
1741
1742                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1743                 if (r < 0) {
1744                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1745                         return r;
1746                 }
1747
1748                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1749                 if (r < 0) {
1750                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1751                         return r;
1752                 }
1753
1754                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1755                 if (r < 0) {
1756                         log_error("Failed to open netlink container: %s", strerror(-r));
1757                         return r;
1758                 }
1759
1760                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1761                 if (r < 0) {
1762                         log_error("Failed to open netlink container: %s", strerror(-r));
1763                         return r;
1764                 }
1765
1766                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1767                 if (r < 0) {
1768                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1769                         return r;
1770                 }
1771
1772                 r = sd_rtnl_message_close_container(m);
1773                 if (r < 0) {
1774                         log_error("Failed to close netlink container: %s", strerror(-r));
1775                         return r;
1776                 }
1777
1778                 r = sd_rtnl_message_close_container(m);
1779                 if (r < 0) {
1780                         log_error("Failed to close netlink container: %s", strerror(-r));
1781                         return r;
1782                 }
1783
1784                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1785                 if (r < 0) {
1786                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1787                         return r;
1788                 }
1789         }
1790
1791         return 0;
1792 }
1793
/* Installs a seccomp filter that allows everything except the creation of
 * NETLINK_AUDIT sockets. Without seccomp support compiled in this is a
 * no-op that returns 0. Returns 0 on success, a negative errno-style value
 * on failure. */
static int audit_still_doesnt_work_in_containers(void) {

#ifndef HAVE_SECCOMP
        return 0;
#else
        scmp_filter_ctx filter;
        int ret;

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        /* Default action: allow every system call */
        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        ret = seccomp_add_secondary_archs(filter);
        if (ret < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-ret));
                goto out;
        }

        /* socket(AF_NETLINK, <any>, NETLINK_AUDIT) => EAFNOSUPPORT */
        ret = seccomp_rule_add(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (ret < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-ret));
                goto out;
        }

        /* Do not have libseccomp turn on NO_NEW_PRIVS when loading the filter */
        ret = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (ret < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-ret));
                goto out;
        }

        ret = seccomp_load(filter);
        if (ret < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-ret));

out:
        /* The filter context is released on both the success and the error path */
        seccomp_release(filter);
        return ret;
#endif

}
1850
1851 static int setup_image(char **device_path, int *loop_nr) {
1852         struct loop_info64 info = {
1853                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1854         };
1855         _cleanup_close_ int fd = -1, control = -1, loop = -1;
1856         _cleanup_free_ char* loopdev = NULL;
1857         struct stat st;
1858         int r, nr;
1859
1860         assert(device_path);
1861         assert(loop_nr);
1862
1863         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1864         if (fd < 0) {
1865                 log_error("Failed to open %s: %m", arg_image);
1866                 return -errno;
1867         }
1868
1869         if (fstat(fd, &st) < 0) {
1870                 log_error("Failed to stat %s: %m", arg_image);
1871                 return -errno;
1872         }
1873
1874         if (S_ISBLK(st.st_mode)) {
1875                 char *p;
1876
1877                 p = strdup(arg_image);
1878                 if (!p)
1879                         return log_oom();
1880
1881                 *device_path = p;
1882
1883                 *loop_nr = -1;
1884
1885                 r = fd;
1886                 fd = -1;
1887
1888                 return r;
1889         }
1890
1891         if (!S_ISREG(st.st_mode)) {
1892                 log_error("%s is not a regular file or block device: %m", arg_image);
1893                 return -EINVAL;
1894         }
1895
1896         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1897         if (control < 0) {
1898                 log_error("Failed to open /dev/loop-control: %m");
1899                 return -errno;
1900         }
1901
1902         nr = ioctl(control, LOOP_CTL_GET_FREE);
1903         if (nr < 0) {
1904                 log_error("Failed to allocate loop device: %m");
1905                 return -errno;
1906         }
1907
1908         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1909                 return log_oom();
1910
1911         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1912         if (loop < 0) {
1913                 log_error("Failed to open loop device %s: %m", loopdev);
1914                 return -errno;
1915         }
1916
1917         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1918                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1919                 return -errno;
1920         }
1921
1922         if (arg_read_only)
1923                 info.lo_flags |= LO_FLAGS_READ_ONLY;
1924
1925         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1926                 log_error("Failed to set loopback settings on %s: %m", loopdev);
1927                 return -errno;
1928         }
1929
1930         *device_path = loopdev;
1931         loopdev = NULL;
1932
1933         *loop_nr = nr;
1934
1935         r = loop;
1936         loop = -1;
1937
1938         return r;
1939 }
1940
/* Inspects the GPT partition table of the block device open on fd and
 * selects the partitions to use.
 *
 * The partition device nodes are enumerated through udev and matched
 * against the libblkid partition list. Partitions carrying
 * GPT_FLAG_NO_AUTO are ignored. For each recognized type (home, srv, and,
 * where the respective macros are defined, native root and secondary root)
 * the partition with the lowest partition number wins.
 *
 * On success returns 0 and stores newly allocated device node paths:
 *   *root_device / *root_device_rw  - the native root partition if there is
 *                                     one, otherwise the secondary root
 *                                     partition; *secondary tells which
 *   *home_device / *home_device_rw  - only written if a home partition exists
 *   *srv_device  / *srv_device_rw   - only written if a srv partition exists
 * The *_rw flags are false if the partition has GPT_FLAG_READ_ONLY set.
 *
 * Returns a negative errno-style value on failure, -EINVAL in particular if
 * no GPT or no root partition is found, and -ENOTSUP if compiled without
 * blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerate = NULL;
        _cleanup_udev_device_unref_ struct udev_device *disk = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *head, *entry;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *table_type = NULL;
        blkid_partlist partitions;
        struct stat st;
        int ret;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        /* Step 1: probe the partition table with libblkid */
        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        /* errno is cleared first so that "failed without errno" can be told apart */
        errno = 0;
        ret = blkid_probe_set_device(probe, fd, 0, 0);
        if (ret != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        ret = blkid_do_safeprobe(probe);
        if (ret == -2 || ret == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }
        if (ret != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        /* table_type stays NULL if the lookup fails, which streq_ptr() is given as is */
        blkid_probe_lookup_value(probe, "PTTYPE", &table_type, NULL);
        if (!streq_ptr(table_type, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partitions = blkid_probe_get_partitions(probe);
        if (!partitions) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        /* Step 2: enumerate the child devices of the disk through udev */
        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        disk = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!disk)
                return log_oom();

        enumerate = udev_enumerate_new(udev);
        if (!enumerate)
                return log_oom();

        ret = udev_enumerate_add_match_parent(enumerate, disk);
        if (ret < 0)
                return log_oom();

        ret = udev_enumerate_scan_devices(enumerate);
        if (ret < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-ret));
                return ret;
        }

        /* Step 3: classify every partition device by its GPT type */
        head = udev_enumerate_get_list_entry(enumerate);
        udev_list_entry_foreach(entry, head) {
                _cleanup_udev_device_unref_ struct udev_device *part_dev = NULL;
                const char *type_str, *devnode;
                unsigned long long gpt_flags;
                sd_id128_t type_uuid;
                blkid_partition part;
                dev_t part_devnum;
                int partno;

                errno = 0;
                part_dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(entry));
                if (!part_dev) {
                        if (errno == 0)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number, and the whole disk itself */
                part_devnum = udev_device_get_devnum(part_dev);
                if (major(part_devnum) == 0 || st.st_rdev == part_devnum)
                        continue;

                devnode = udev_device_get_devnode(part_dev);
                if (!devnode)
                        continue;

                /* Map the udev device to its entry in the libblkid partition list */
                part = blkid_partlist_devno_to_partition(partitions, part_devnum);
                if (!part)
                        continue;

                gpt_flags = blkid_partition_get_flags(part);
                if (gpt_flags & GPT_FLAG_NO_AUTO)
                        continue;

                partno = blkid_partition_get_partno(part);
                if (partno < 0)
                        continue;

                type_str = blkid_partition_get_type_string(part);
                if (!type_str)
                        continue;

                if (sd_id128_from_string(type_str, &type_uuid) < 0)
                        continue;

                /* Within each type the lowest partition number is preferred */
                if (sd_id128_equal(type_uuid, GPT_HOME)) {

                        if (home && partno >= home_nr)
                                continue;

                        home_nr = partno;
                        home_rw = !(gpt_flags & GPT_FLAG_READ_ONLY);

                        free(home);
                        home = strdup(devnode);
                        if (!home)
                                return log_oom();

                } else if (sd_id128_equal(type_uuid, GPT_SRV)) {

                        if (srv && partno >= srv_nr)
                                continue;

                        srv_nr = partno;
                        srv_rw = !(gpt_flags & GPT_FLAG_READ_ONLY);

                        free(srv);
                        srv = strdup(devnode);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_uuid, GPT_ROOT_NATIVE)) {

                        if (root && partno >= root_nr)
                                continue;

                        root_nr = partno;
                        root_rw = !(gpt_flags & GPT_FLAG_READ_ONLY);

                        free(root);
                        root = strdup(devnode);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_uuid, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && partno >= secondary_root_nr)
                                continue;

                        secondary_root_nr = partno;
                        secondary_root_rw = !(gpt_flags & GPT_FLAG_READ_ONLY);

                        free(secondary_root);
                        secondary_root = strdup(devnode);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        /* Step 4: hand the results over to the caller */
        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        if (root) {
                /* The native root partition takes precedence */
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else {
                /* secondary_root is necessarily set here, see the check above */
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2190
/* Detects the file system type of the block device what with libblkid and
 * mounts it with MS_NODEV on where, or on where with directory appended if
 * directory is non-NULL.
 *
 * The mount is read-only if rw is false or arg_read_only is set. LUKS
 * volumes are refused with -ENOTSUP. Returns 0 on success, a negative
 * errno-style value on failure, and -ENOTSUP if compiled without blkid
 * support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A read-only container overrides the per-partition setting */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 if
         * the result is ambivalent: both mean the type cannot be determined.
         * -1 is a genuine probing error and is reported as such below. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2256
/* Mounts the dissected partitions below where: root_device on where
 * itself, home_device on where + "/home" and srv_device on where + "/srv".
 * Any device given as NULL is skipped. The *_rw flags are passed on to
 * mount_device().
 *
 * Returns 0 on success, or the negative errno-style value of the first
 * mount that failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the where argument rather than reaching for the
         * arg_directory global, so that the parameter is actually honored.
         * The root file system must come first, since the other mount
         * points live inside it. */
        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2292
2293 static void loop_remove(int nr, int *image_fd) {
2294         _cleanup_close_ int control = -1;
2295
2296         if (nr < 0)
2297                 return;
2298
2299         if (image_fd && *image_fd >= 0) {
2300                 ioctl(*image_fd, LOOP_CLR_FD);
2301                 *image_fd = safe_close(*image_fd);
2302         }
2303
2304         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2305         if (control < 0)
2306                 return;
2307
2308         ioctl(control, LOOP_CTL_REMOVE, nr);
2309 }
2310
/* Forks off a child that runs "getent <database> <key>" with an empty
 * environment, stdout connected to a pipe and stdin/stderr connected to
 * /dev/null. /usr/bin/getent is tried first, then /bin/getent.
 *
 * On success returns the read end of the pipe (owned by the caller) and
 * stores the PID of the child in *rpid; the caller is expected to reap it.
 * On failure logs an error and returns a negative errno-style value,
 * leaving no file descriptors behind. */
static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
        int pipe_fds[2];
        pid_t pid;

        assert(database);
        assert(key);
        assert(rpid);

        if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
                log_error("Failed to allocate pipe: %m");
                return -errno;
        }

        pid = fork();
        if (pid < 0) {
                /* Save errno before the cleanup below gets a chance to change it */
                int saved_errno = errno;

                log_error("Failed to fork getent child: %m");

                /* Without a child nobody will ever use the pipe: close both
                 * ends instead of leaking them. */
                safe_close(pipe_fds[0]);
                safe_close(pipe_fds[1]);

                return -saved_errno;
        }

        if (pid == 0) {
                /* Child */
                int nullfd;
                char *empty_env = NULL;

                /* The pipe was opened O_CLOEXEC; the duplicate made with
                 * flags 0 is not, so stdout survives the exec. */
                if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                /* Only close the originals if they are not one of the standard fds */
                if (pipe_fds[0] > 2)
                        safe_close(pipe_fds[0]);
                if (pipe_fds[1] > 2)
                        safe_close(pipe_fds[1]);

                nullfd = open("/dev/null", O_RDWR);
                if (nullfd < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDIN_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDERR_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (nullfd > 2)
                        safe_close(nullfd);

                reset_all_signal_handlers();
                close_all_fds(NULL, 0);

                /* execle() only returns on failure; fall back to the second path, then give up */
                execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
                execle("/bin/getent", "getent", database, key, NULL, &empty_env);
                _exit(EXIT_FAILURE);
        }

        /* Parent: keep only the read end */
        pipe_fds[1] = safe_close(pipe_fds[1]);

        *rpid = pid;

        return pipe_fds[0];
}
2367
2368 static int change_uid_gid(char **_home) {
2369         char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2370         _cleanup_free_ uid_t *uids = NULL;
2371         _cleanup_free_ char *home = NULL;
2372         _cleanup_fclose_ FILE *f = NULL;
2373         _cleanup_close_ int fd = -1;
2374         unsigned n_uids = 0;
2375         size_t sz = 0, l;
2376         uid_t uid;
2377         gid_t gid;
2378         pid_t pid;
2379         int r;
2380
2381         assert(_home);
2382
2383         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2384                 /* Reset everything fully to 0, just in case */
2385
2386                 if (setgroups(0, NULL) < 0) {
2387                         log_error("setgroups() failed: %m");
2388                         return -errno;
2389                 }
2390
2391                 if (setresgid(0, 0, 0) < 0) {
2392                         log_error("setregid() failed: %m");
2393                         return -errno;
2394                 }
2395
2396                 if (setresuid(0, 0, 0) < 0) {
2397                         log_error("setreuid() failed: %m");
2398                         return -errno;
2399                 }
2400
2401                 *_home = NULL;
2402                 return 0;
2403         }
2404
2405         /* First, get user credentials */
2406         fd = spawn_getent("passwd", arg_user, &pid);
2407         if (fd < 0)
2408                 return fd;
2409
2410         f = fdopen(fd, "r");
2411         if (!f)
2412                 return log_oom();
2413         fd = -1;
2414
2415         if (!fgets(line, sizeof(line), f)) {
2416
2417                 if (!ferror(f)) {
2418                         log_error("Failed to resolve user %s.", arg_user);
2419                         return -ESRCH;
2420                 }
2421
2422                 log_error("Failed to read from getent: %m");
2423                 return -errno;
2424         }
2425
2426         truncate_nl(line);
2427
2428         wait_for_terminate_and_warn("getent passwd", pid);
2429
2430         x = strchr(line, ':');
2431         if (!x) {
2432                 log_error("/etc/passwd entry has invalid user field.");
2433                 return -EIO;
2434         }
2435
2436         u = strchr(x+1, ':');
2437         if (!u) {
2438                 log_error("/etc/passwd entry has invalid password field.");
2439                 return -EIO;
2440         }
2441
2442         u++;
2443         g = strchr(u, ':');
2444         if (!g) {
2445                 log_error("/etc/passwd entry has invalid UID field.");
2446                 return -EIO;
2447         }
2448
2449         *g = 0;
2450         g++;
2451         x = strchr(g, ':');
2452         if (!x) {
2453                 log_error("/etc/passwd entry has invalid GID field.");
2454                 return -EIO;
2455         }
2456
2457         *x = 0;
2458         h = strchr(x+1, ':');
2459         if (!h) {
2460                 log_error("/etc/passwd entry has invalid GECOS field.");
2461                 return -EIO;
2462         }
2463
2464         h++;
2465         x = strchr(h, ':');
2466         if (!x) {
2467                 log_error("/etc/passwd entry has invalid home directory field.");
2468                 return -EIO;
2469         }
2470
2471         *x = 0;
2472
2473         r = parse_uid(u, &uid);
2474         if (r < 0) {
2475                 log_error("Failed to parse UID of user.");
2476                 return -EIO;
2477         }
2478
2479         r = parse_gid(g, &gid);
2480         if (r < 0) {
2481                 log_error("Failed to parse GID of user.");
2482                 return -EIO;
2483         }
2484
2485         home = strdup(h);
2486         if (!home)
2487                 return log_oom();
2488
2489         /* Second, get group memberships */
2490         fd = spawn_getent("initgroups", arg_user, &pid);
2491         if (fd < 0)
2492                 return fd;
2493
2494         fclose(f);
2495         f = fdopen(fd, "r");
2496         if (!f)
2497                 return log_oom();
2498         fd = -1;
2499
2500         if (!fgets(line, sizeof(line), f)) {
2501                 if (!ferror(f)) {
2502                         log_error("Failed to resolve user %s.", arg_user);
2503                         return -ESRCH;
2504                 }
2505
2506                 log_error("Failed to read from getent: %m");
2507                 return -errno;
2508         }
2509
2510         truncate_nl(line);
2511
2512         wait_for_terminate_and_warn("getent initgroups", pid);
2513
2514         /* Skip over the username and subsequent separator whitespace */
2515         x = line;
2516         x += strcspn(x, WHITESPACE);
2517         x += strspn(x, WHITESPACE);
2518
2519         FOREACH_WORD(w, l, x, state) {
2520                 char c[l+1];
2521
2522                 memcpy(c, w, l);
2523                 c[l] = 0;
2524
2525                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2526                         return log_oom();
2527
2528                 r = parse_uid(c, &uids[n_uids++]);
2529                 if (r < 0) {
2530                         log_error("Failed to parse group data from getent.");
2531                         return -EIO;
2532                 }
2533         }
2534
2535         r = mkdir_parents(home, 0775);
2536         if (r < 0) {
2537                 log_error("Failed to make home root directory: %s", strerror(-r));
2538                 return r;
2539         }
2540
2541         r = mkdir_safe(home, 0755, uid, gid);
2542         if (r < 0 && r != -EEXIST) {
2543                 log_error("Failed to make home directory: %s", strerror(-r));
2544                 return r;
2545         }
2546
2547         fchown(STDIN_FILENO, uid, gid);
2548         fchown(STDOUT_FILENO, uid, gid);
2549         fchown(STDERR_FILENO, uid, gid);
2550
2551         if (setgroups(n_uids, uids) < 0) {
2552                 log_error("Failed to set auxiliary groups: %m");
2553                 return -errno;
2554         }
2555
2556         if (setresgid(gid, gid, gid) < 0) {
2557                 log_error("setregid() failed: %m");
2558                 return -errno;
2559         }
2560
2561         if (setresuid(uid, uid, uid) < 0) {
2562                 log_error("setreuid() failed: %m");
2563                 return -errno;
2564         }
2565
2566         if (_home) {
2567                 *_home = home;
2568                 home = NULL;
2569         }
2570
2571         return 0;
2572 }
2573
2574 int main(int argc, char *argv[]) {
2575
2576         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2577         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2578         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2579         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2580         _cleanup_fdset_free_ FDSet *fds = NULL;
2581         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2582         const char *console = NULL;
2583         char veth_name[IFNAMSIZ];
2584         bool secondary = false;
2585         pid_t pid = 0;
2586         sigset_t mask;
2587
2588         log_parse_environment();
2589         log_open();
2590
2591         k = parse_argv(argc, argv);
2592         if (k < 0)
2593                 goto finish;
2594         else if (k == 0) {
2595                 r = EXIT_SUCCESS;
2596                 goto finish;
2597         }
2598
2599         if (!arg_image) {
2600                 if (arg_directory) {
2601                         char *p;
2602
2603                         p = path_make_absolute_cwd(arg_directory);
2604                         free(arg_directory);
2605                         arg_directory = p;
2606                 } else
2607                         arg_directory = get_current_dir_name();
2608
2609                 if (!arg_directory) {
2610                         log_error("Failed to determine path, please use -D.");
2611                         goto finish;
2612                 }
2613                 path_kill_slashes(arg_directory);
2614         }
2615
2616         if (!arg_machine) {
2617                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2618                 if (!arg_machine) {
2619                         log_oom();
2620                         goto finish;
2621                 }
2622
2623                 hostname_cleanup(arg_machine, false);
2624                 if (isempty(arg_machine)) {
2625                         log_error("Failed to determine machine name automatically, please use -M.");
2626                         goto finish;
2627                 }
2628         }
2629
2630         if (geteuid() != 0) {
2631                 log_error("Need to be root.");
2632                 goto finish;
2633         }
2634
2635         if (sd_booted() <= 0) {
2636                 log_error("Not running on a systemd system.");
2637                 goto finish;
2638         }
2639
2640         log_close();
2641         n_fd_passed = sd_listen_fds(false);
2642         if (n_fd_passed > 0) {
2643                 k = fdset_new_listen_fds(&fds, false);
2644                 if (k < 0) {
2645                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2646                         goto finish;
2647                 }
2648         }
2649         fdset_close_others(fds);
2650         log_open();
2651
2652         if (arg_directory) {
2653                 if (path_equal(arg_directory, "/")) {
2654                         log_error("Spawning container on root directory not supported.");
2655                         goto finish;
2656                 }
2657
2658                 if (arg_boot) {
2659                         if (path_is_os_tree(arg_directory) <= 0) {
2660                                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2661                                 goto finish;
2662                         }
2663                 } else {
2664                         const char *p;
2665
2666                         p = strappenda(arg_directory,
2667                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2668                         if (access(p, F_OK) < 0) {
2669                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2670                                 goto finish;
2671
2672                         }
2673                 }
2674         } else {
2675                 char template[] = "/tmp/nspawn-root-XXXXXX";
2676
2677                 if (!mkdtemp(template)) {
2678                         log_error("Failed to create temporary directory: %m");
2679                         r = -errno;
2680                         goto finish;
2681                 }
2682
2683                 arg_directory = strdup(template);
2684                 if (!arg_directory) {
2685                         r = log_oom();
2686                         goto finish;
2687                 }
2688
2689                 image_fd = setup_image(&device_path, &loop_nr);
2690                 if (image_fd < 0) {
2691                         r = image_fd;
2692                         goto finish;
2693                 }
2694
2695                 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2696                 if (r < 0)
2697                         goto finish;
2698         }
2699
2700         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2701         if (master < 0) {
2702                 log_error("Failed to acquire pseudo tty: %m");
2703                 goto finish;
2704         }
2705
2706         console = ptsname(master);
2707         if (!console) {
2708                 log_error("Failed to determine tty name: %m");
2709                 goto finish;
2710         }
2711
2712         if (!arg_quiet)
2713                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
2714
2715         if (unlockpt(master) < 0) {
2716                 log_error("Failed to unlock tty: %m");
2717                 goto finish;
2718         }
2719
2720         if (access("/dev/kdbus/control", F_OK) >= 0) {
2721
2722                 if (arg_share_system) {
2723                         kdbus_domain = strdup("/dev/kdbus");
2724                         if (!kdbus_domain) {
2725                                 log_oom();
2726                                 goto finish;
2727                         }
2728                 } else {
2729                         const char *ns;
2730
2731                         ns = strappenda("machine-", arg_machine);
2732                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2733                         if (r < 0)
2734                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2735                         else
2736                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2737                 }
2738         }
2739
2740         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2741                 log_error("Failed to create kmsg socket pair: %m");
2742                 goto finish;
2743         }
2744
2745         sd_notify(0, "READY=1");
2746
2747         assert_se(sigemptyset(&mask) == 0);
2748         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2749         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2750
2751         for (;;) {
2752                 int parent_ready_fd = -1, child_ready_fd = -1;
2753                 siginfo_t status;
2754                 eventfd_t x;
2755
2756                 parent_ready_fd = eventfd(0, EFD_CLOEXEC);
2757                 if (parent_ready_fd < 0) {
2758                         log_error("Failed to create event fd: %m");
2759                         goto finish;
2760                 }
2761
2762                 child_ready_fd = eventfd(0, EFD_CLOEXEC);
2763                 if (child_ready_fd < 0) {
2764                         log_error("Failed to create event fd: %m");
2765                         goto finish;
2766                 }
2767
2768                 pid = syscall(__NR_clone,
2769                               SIGCHLD|CLONE_NEWNS|
2770                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2771                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
2772                 if (pid < 0) {
2773                         if (errno == EINVAL)
2774                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2775                         else
2776                                 log_error("clone() failed: %m");
2777
2778                         goto finish;
2779                 }
2780
2781                 if (pid == 0) {
2782                         /* child */
2783                         _cleanup_free_ char *home = NULL;
2784                         unsigned n_env = 2;
2785                         const char *envp[] = {
2786                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
2787                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2788                                 NULL, /* TERM */
2789                                 NULL, /* HOME */
2790                                 NULL, /* USER */
2791                                 NULL, /* LOGNAME */
2792                                 NULL, /* container_uuid */
2793                                 NULL, /* LISTEN_FDS */
2794                                 NULL, /* LISTEN_PID */
2795                                 NULL
2796                         };
2797                         char **env_use;
2798
2799                         envp[n_env] = strv_find_prefix(environ, "TERM=");
2800                         if (envp[n_env])
2801                                 n_env ++;
2802
2803                         master = safe_close(master);
2804
2805                         close_nointr(STDIN_FILENO);
2806                         close_nointr(STDOUT_FILENO);
2807                         close_nointr(STDERR_FILENO);
2808
2809                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2810
2811                         reset_all_signal_handlers();
2812
2813                         assert_se(sigemptyset(&mask) == 0);
2814                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2815
2816                         k = open_terminal(console, O_RDWR);
2817                         if (k != STDIN_FILENO) {
2818                                 if (k >= 0) {
2819                                         safe_close(k);
2820                                         k = -EINVAL;
2821                                 }
2822
2823                                 log_error("Failed to open console: %s", strerror(-k));
2824                                 goto child_fail;
2825                         }
2826
2827                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2828                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2829                                 log_error("Failed to duplicate console: %m");
2830                                 goto child_fail;
2831                         }
2832
2833                         if (setsid() < 0) {
2834                                 log_error("setsid() failed: %m");
2835                                 goto child_fail;
2836                         }
2837
2838                         if (reset_audit_loginuid() < 0)
2839                                 goto child_fail;
2840
2841                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2842                                 log_error("PR_SET_PDEATHSIG failed: %m");
2843                                 goto child_fail;
2844                         }
2845
2846                         /* Mark everything as slave, so that we still
2847                          * receive mounts from the real root, but don't
2848                          * propagate mounts to the real root. */
2849                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2850                                 log_error("MS_SLAVE|MS_REC failed: %m");
2851                                 goto child_fail;
2852                         }
2853
2854                         if (mount_devices(arg_directory,
2855                                           root_device, root_device_rw,
2856                                           home_device, home_device_rw,
2857                                           srv_device, srv_device_rw) < 0)
2858                                 goto child_fail;
2859
2860                         /* Turn directory into bind mount */
2861                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2862                                 log_error("Failed to make bind mount.");
2863                                 goto child_fail;
2864                         }
2865
2866                         if (arg_read_only)
2867                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2868                                         log_error("Failed to make read-only.");
2869                                         goto child_fail;
2870                                 }
2871
2872                         if (mount_all(arg_directory) < 0)
2873                                 goto child_fail;
2874
2875                         if (copy_devnodes(arg_directory) < 0)
2876                                 goto child_fail;
2877
2878                         if (setup_ptmx(arg_directory) < 0)
2879                                 goto child_fail;
2880
2881                         dev_setup(arg_directory);
2882
2883                         if (audit_still_doesnt_work_in_containers() < 0)
2884                                 goto child_fail;
2885
2886                         if (setup_dev_console(arg_directory, console) < 0)
2887                                 goto child_fail;
2888
2889                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2890                                 goto child_fail;
2891
2892                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
2893
2894                         if (setup_boot_id(arg_directory) < 0)
2895                                 goto child_fail;
2896
2897                         if (setup_timezone(arg_directory) < 0)
2898                                 goto child_fail;
2899
2900                         if (setup_resolv_conf(arg_directory) < 0)
2901                                 goto child_fail;
2902
2903                         if (setup_journal(arg_directory) < 0)
2904                                 goto child_fail;
2905
2906                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
2907                                 goto child_fail;
2908
2909                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2910                                 goto child_fail;
2911
2912                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
2913                                 goto child_fail;
2914
2915                         /* Tell the parent that we are ready, and that
2916                          * it can cgroupify us to that we lack access
2917                          * to certain devices and resources. */
2918                         eventfd_write(child_ready_fd, 1);
2919                         child_ready_fd = safe_close(child_ready_fd);
2920
2921                         if (chdir(arg_directory) < 0) {
2922                                 log_error("chdir(%s) failed: %m", arg_directory);
2923                                 goto child_fail;
2924                         }
2925
2926                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2927                                 log_error("mount(MS_MOVE) failed: %m");
2928                                 goto child_fail;
2929                         }
2930
2931                         if (chroot(".") < 0) {
2932                                 log_error("chroot() failed: %m");
2933                                 goto child_fail;
2934                         }
2935
2936                         if (chdir("/") < 0) {
2937                                 log_error("chdir() failed: %m");
2938                                 goto child_fail;
2939                         }
2940
2941                         umask(0022);
2942
2943                         if (arg_private_network)
2944                                 loopback_setup();
2945
2946                         if (drop_capabilities() < 0) {
2947                                 log_error("drop_capabilities() failed: %m");
2948                                 goto child_fail;
2949                         }
2950
2951                         r = change_uid_gid(&home);
2952                         if (r < 0)
2953                                 goto child_fail;
2954
2955                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2956                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2957                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
2958                                 log_oom();
2959                                 goto child_fail;
2960                         }
2961
2962                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2963                                 char as_uuid[37];
2964
2965                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
2966                                         log_oom();
2967                                         goto child_fail;
2968                                 }
2969                         }
2970
2971                         if (fdset_size(fds) > 0) {
2972                                 k = fdset_cloexec(fds, false);
2973                                 if (k < 0) {
2974                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
2975                                         goto child_fail;
2976                                 }
2977
2978                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
2979                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
2980                                         log_oom();
2981                                         goto child_fail;
2982                                 }
2983                         }
2984
2985                         setup_hostname();
2986
2987                         if (arg_personality != 0xffffffffLU) {
2988                                 if (personality(arg_personality) < 0) {
2989                                         log_error("personality() failed: %m");
2990                                         goto child_fail;
2991                                 }
2992                         } else if (secondary) {
2993                                 if (personality(PER_LINUX32) < 0) {
2994                                         log_error("personality() failed: %m");
2995                                         goto child_fail;
2996                                 }
2997                         }
2998
2999 #ifdef HAVE_SELINUX
3000                         if (arg_selinux_context)
3001                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3002                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3003                                         goto child_fail;
3004                                 }
3005 #endif
3006
3007                         if (!strv_isempty(arg_setenv)) {
3008                                 char **n;
3009
3010                                 n = strv_env_merge(2, envp, arg_setenv);
3011                                 if (!n) {
3012                                         log_oom();
3013                                         goto child_fail;
3014                                 }
3015
3016                                 env_use = n;
3017                         } else
3018                                 env_use = (char**) envp;
3019
3020                         /* Wait until the parent is ready with the setup, too... */
3021                         eventfd_read(parent_ready_fd, &x);
3022                         parent_ready_fd = safe_close(parent_ready_fd);
3023
3024                         if (arg_boot) {
3025                                 char **a;
3026                                 size_t l;
3027
3028                                 /* Automatically search for the init system */
3029
3030                                 l = 1 + argc - optind;
3031                                 a = newa(char*, l + 1);
3032                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3033
3034                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3035                                 execve(a[0], a, env_use);
3036
3037                                 a[0] = (char*) "/lib/systemd/systemd";
3038                                 execve(a[0], a, env_use);
3039
3040                                 a[0] = (char*) "/sbin/init";
3041                                 execve(a[0], a, env_use);
3042                         } else if (argc > optind)
3043                                 execvpe(argv[optind], argv + optind, env_use);
3044                         else {
3045                                 chdir(home ? home : "/root");
3046                                 execle("/bin/bash", "-bash", NULL, env_use);
3047                                 execle("/bin/sh", "-sh", NULL, env_use);
3048                         }
3049
3050                         log_error("execv() failed: %m");
3051
3052                 child_fail:
3053                         _exit(EXIT_FAILURE);
3054                 }
3055
3056                 fdset_free(fds);
3057                 fds = NULL;
3058
3059                 /* Wait until the child reported that it is ready with
3060                  * all it needs to do with privileges. After we got
3061                  * the notification we can make the process join its
3062                  * cgroup which might limit what it can do */
3063                 eventfd_read(child_ready_fd, &x);
3064
3065                 r = register_machine(pid);
3066                 if (r < 0)
3067                         goto finish;
3068
3069                 r = move_network_interfaces(pid);
3070                 if (r < 0)
3071                         goto finish;
3072
3073                 r = setup_veth(pid, veth_name);
3074                 if (r < 0)
3075                         goto finish;
3076
3077                 r = setup_bridge(veth_name);
3078                 if (r < 0)
3079                         goto finish;
3080
3081                 r = setup_macvlan(pid);
3082                 if (r < 0)
3083                         goto finish;
3084
3085                 /* Notify the child that the parent is ready with all
3086                  * its setup, and thtat the child can now hand over
3087                  * control to the code to run inside the container. */
3088                 eventfd_write(parent_ready_fd, 1);
3089
3090                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3091                 if (k < 0) {
3092                         r = EXIT_FAILURE;
3093                         break;
3094                 }
3095
3096                 if (!arg_quiet)
3097                         putc('\n', stdout);
3098
3099                 /* Kill if it is not dead yet anyway */
3100                 terminate_machine(pid);
3101
3102                 /* Redundant, but better safe than sorry */
3103                 kill(pid, SIGKILL);
3104
3105                 k = wait_for_terminate(pid, &status);
3106                 pid = 0;
3107
3108                 if (k < 0) {
3109                         r = EXIT_FAILURE;
3110                         break;
3111                 }
3112
3113                 if (status.si_code == CLD_EXITED) {
3114                         r = status.si_status;
3115                         if (status.si_status != 0) {
3116                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
3117                                 break;
3118                         }
3119
3120                         if (!arg_quiet)
3121                                 log_debug("Container %s exited successfully.", arg_machine);
3122                         break;
3123                 } else if (status.si_code == CLD_KILLED &&
3124                            status.si_status == SIGINT) {
3125
3126                         if (!arg_quiet)
3127                                 log_info("Container %s has been shut down.", arg_machine);
3128                         r = 0;
3129                         break;
3130                 } else if (status.si_code == CLD_KILLED &&
3131                            status.si_status == SIGHUP) {
3132
3133                         if (!arg_quiet)
3134                                 log_info("Container %s is being rebooted.", arg_machine);
3135                         continue;
3136                 } else if (status.si_code == CLD_KILLED ||
3137                            status.si_code == CLD_DUMPED) {
3138
3139                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3140                         r = EXIT_FAILURE;
3141                         break;
3142                 } else {
3143                         log_error("Container %s failed due to unknown reason.", arg_machine);
3144                         r = EXIT_FAILURE;
3145                         break;
3146                 }
3147         }
3148
3149 finish:
3150         loop_remove(loop_nr, &image_fd);
3151
3152         if (pid > 0)
3153                 kill(pid, SIGKILL);
3154
3155         free(arg_directory);
3156         free(arg_machine);
3157         free(arg_user);
3158         strv_free(arg_setenv);
3159         strv_free(arg_network_interfaces);
3160         strv_free(arg_network_macvlan);
3161         strv_free(arg_bind);
3162         strv_free(arg_bind_ro);
3163
3164         return r;
3165 }