chiark / gitweb /
ef84664dfeefff4dbdf61387c3c9e3d7e7e179ac
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90
91 #ifdef HAVE_SECCOMP
92 #include "seccomp-util.h"
93 #endif
94
/* Journal linking mode for the container. The value is selected in
 * parse_argv() from --link-journal=MODE (or the -j shortcut) and stored in
 * arg_link_journal. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; also the initial value of arg_link_journal */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest; also what -j selects */
} LinkJournal;
101
/* Settings filled in by parse_argv() from the command line. Pointers typed
 * "const char *" are assigned directly from optarg; the "char *" ones hold
 * heap copies that parse_argv() frees before replacing them. */

/* -D/--directory=: result of canonicalize_file_name() on the argument */
static char *arg_directory = NULL;
/* -u/--user= */
static char *arg_user = NULL;
/* --uuid=, parsed with sd_id128_from_string() */
static sd_id128_t arg_uuid = {};
/* -M/--machine=; reset to NULL when an empty name is given */
static char *arg_machine = NULL;
/* -Z/--selinux-context= */
static const char *arg_selinux_context = NULL;
/* -L/--selinux-apifs-context=; appended as context="..." to tmpfs/devpts mount options in mount_all() */
static const char *arg_selinux_apifs_context = NULL;
/* -S/--slice= */
static const char *arg_slice = NULL;
/* --private-network; also switched on by every --network-* option */
static bool arg_private_network = false;
/* --read-only */
static bool arg_read_only = false;
/* -b/--boot */
static bool arg_boot = false;
/* --link-journal= / -j */
static LinkJournal arg_link_journal = LINK_AUTO;
/* Capability bitmask (one bit per CAP_* number). This is the default set;
 * parse_argv() ORs in --capability= values (plus CAP_NET_ADMIN when a private
 * network is requested) and then masks out --drop-capability= values. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
/* --bind= / --bind-ro=: flat string lists holding consecutive
 * (source, destination) entries, consumed pairwise by mount_binds() */
static char **arg_bind = NULL;
static char **arg_bind_ro = NULL;
/* --setenv=NAME=VALUE assignments */
static char **arg_setenv = NULL;
/* -q/--quiet */
static bool arg_quiet = false;
/* --share-system */
static bool arg_share_system = false;
/* --register=BOOLEAN; forced to false by parse_argv() when --share-system is set */
static bool arg_register = true;
/* --keep-unit */
static bool arg_keep_unit = false;
/* --network-interface= (may be given multiple times) */
static char **arg_network_interfaces = NULL;
/* --network-macvlan= (may be given multiple times) */
static char **arg_network_macvlan = NULL;
/* --network-veth; also switched on by --network-bridge= */
static bool arg_network_veth = false;
/* --network-bridge= */
static const char *arg_network_bridge = NULL;
/* --personality=; 0xffffffffLU is the value parse_argv() treats as
 * "unknown/unsupported" when returned by personality_from_string() */
static unsigned long arg_personality = 0xffffffffLU;
/* -i/--image= */
static const char *arg_image = NULL;

153
/* Prints the usage synopsis and the list of supported command line options
 * to stdout, prefixed with the name the program was invoked as.
 *
 * Always returns 0. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "  -i --image=PATH           File system device or image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               /* The option parser maps -j to the "guest" mode, not "host". */
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               /* Accepted by the option parser, so it is listed here as well. */
               "     --personality=ARCH     Pick personality for this container\n",
               program_invocation_short_name);

        return 0;
}

205
206 static int parse_argv(int argc, char *argv[]) {
207
208         enum {
209                 ARG_VERSION = 0x100,
210                 ARG_PRIVATE_NETWORK,
211                 ARG_UUID,
212                 ARG_READ_ONLY,
213                 ARG_CAPABILITY,
214                 ARG_DROP_CAPABILITY,
215                 ARG_LINK_JOURNAL,
216                 ARG_BIND,
217                 ARG_BIND_RO,
218                 ARG_SETENV,
219                 ARG_SHARE_SYSTEM,
220                 ARG_REGISTER,
221                 ARG_KEEP_UNIT,
222                 ARG_NETWORK_INTERFACE,
223                 ARG_NETWORK_MACVLAN,
224                 ARG_NETWORK_VETH,
225                 ARG_NETWORK_BRIDGE,
226                 ARG_PERSONALITY,
227         };
228
229         static const struct option options[] = {
230                 { "help",                  no_argument,       NULL, 'h'                   },
231                 { "version",               no_argument,       NULL, ARG_VERSION           },
232                 { "directory",             required_argument, NULL, 'D'                   },
233                 { "user",                  required_argument, NULL, 'u'                   },
234                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
235                 { "boot",                  no_argument,       NULL, 'b'                   },
236                 { "uuid",                  required_argument, NULL, ARG_UUID              },
237                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
238                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
239                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
240                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
241                 { "bind",                  required_argument, NULL, ARG_BIND              },
242                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
243                 { "machine",               required_argument, NULL, 'M'                   },
244                 { "slice",                 required_argument, NULL, 'S'                   },
245                 { "setenv",                required_argument, NULL, ARG_SETENV            },
246                 { "selinux-context",       required_argument, NULL, 'Z'                   },
247                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
248                 { "quiet",                 no_argument,       NULL, 'q'                   },
249                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
250                 { "register",              required_argument, NULL, ARG_REGISTER          },
251                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
252                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
253                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
254                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
255                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
256                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
257                 { "image",                 required_argument, NULL, 'i'                   },
258                 {}
259         };
260
261         int c, r;
262         uint64_t plus = 0, minus = 0;
263
264         assert(argc >= 0);
265         assert(argv);
266
267         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
268
269                 switch (c) {
270
271                 case 'h':
272                         return help();
273
274                 case ARG_VERSION:
275                         puts(PACKAGE_STRING);
276                         puts(SYSTEMD_FEATURES);
277                         return 0;
278
279                 case 'D':
280                         free(arg_directory);
281                         arg_directory = canonicalize_file_name(optarg);
282                         if (!arg_directory) {
283                                 log_error("Invalid root directory: %m");
284                                 return -ENOMEM;
285                         }
286
287                         break;
288
289                 case 'i':
290                         arg_image = optarg;
291                         break;
292
293                 case 'u':
294                         free(arg_user);
295                         arg_user = strdup(optarg);
296                         if (!arg_user)
297                                 return log_oom();
298
299                         break;
300
301                 case ARG_NETWORK_BRIDGE:
302                         arg_network_bridge = optarg;
303
304                         /* fall through */
305
306                 case ARG_NETWORK_VETH:
307                         arg_network_veth = true;
308                         arg_private_network = true;
309                         break;
310
311                 case ARG_NETWORK_INTERFACE:
312                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
313                                 return log_oom();
314
315                         arg_private_network = true;
316                         break;
317
318                 case ARG_NETWORK_MACVLAN:
319                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
320                                 return log_oom();
321
322                         /* fall through */
323
324                 case ARG_PRIVATE_NETWORK:
325                         arg_private_network = true;
326                         break;
327
328                 case 'b':
329                         arg_boot = true;
330                         break;
331
332                 case ARG_UUID:
333                         r = sd_id128_from_string(optarg, &arg_uuid);
334                         if (r < 0) {
335                                 log_error("Invalid UUID: %s", optarg);
336                                 return r;
337                         }
338                         break;
339
340                 case 'S':
341                         arg_slice = optarg;
342                         break;
343
344                 case 'M':
345                         if (isempty(optarg)) {
346                                 free(arg_machine);
347                                 arg_machine = NULL;
348                         } else {
349
350                                 if (!hostname_is_valid(optarg)) {
351                                         log_error("Invalid machine name: %s", optarg);
352                                         return -EINVAL;
353                                 }
354
355                                 free(arg_machine);
356                                 arg_machine = strdup(optarg);
357                                 if (!arg_machine)
358                                         return log_oom();
359
360                                 break;
361                         }
362
363                 case 'Z':
364                         arg_selinux_context = optarg;
365                         break;
366
367                 case 'L':
368                         arg_selinux_apifs_context = optarg;
369                         break;
370
371                 case ARG_READ_ONLY:
372                         arg_read_only = true;
373                         break;
374
375                 case ARG_CAPABILITY:
376                 case ARG_DROP_CAPABILITY: {
377                         char *state, *word;
378                         size_t length;
379
380                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
381                                 _cleanup_free_ char *t;
382                                 cap_value_t cap;
383
384                                 t = strndup(word, length);
385                                 if (!t)
386                                         return log_oom();
387
388                                 if (streq(t, "all")) {
389                                         if (c == ARG_CAPABILITY)
390                                                 plus = (uint64_t) -1;
391                                         else
392                                                 minus = (uint64_t) -1;
393                                 } else {
394                                         if (cap_from_name(t, &cap) < 0) {
395                                                 log_error("Failed to parse capability %s.", t);
396                                                 return -EINVAL;
397                                         }
398
399                                         if (c == ARG_CAPABILITY)
400                                                 plus |= 1ULL << (uint64_t) cap;
401                                         else
402                                                 minus |= 1ULL << (uint64_t) cap;
403                                 }
404                         }
405
406                         break;
407                 }
408
409                 case 'j':
410                         arg_link_journal = LINK_GUEST;
411                         break;
412
413                 case ARG_LINK_JOURNAL:
414                         if (streq(optarg, "auto"))
415                                 arg_link_journal = LINK_AUTO;
416                         else if (streq(optarg, "no"))
417                                 arg_link_journal = LINK_NO;
418                         else if (streq(optarg, "guest"))
419                                 arg_link_journal = LINK_GUEST;
420                         else if (streq(optarg, "host"))
421                                 arg_link_journal = LINK_HOST;
422                         else {
423                                 log_error("Failed to parse link journal mode %s", optarg);
424                                 return -EINVAL;
425                         }
426
427                         break;
428
429                 case ARG_BIND:
430                 case ARG_BIND_RO: {
431                         _cleanup_free_ char *a = NULL, *b = NULL;
432                         char *e;
433                         char ***x;
434
435                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
436
437                         e = strchr(optarg, ':');
438                         if (e) {
439                                 a = strndup(optarg, e - optarg);
440                                 b = strdup(e + 1);
441                         } else {
442                                 a = strdup(optarg);
443                                 b = strdup(optarg);
444                         }
445
446                         if (!a || !b)
447                                 return log_oom();
448
449                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
450                                 log_error("Invalid bind mount specification: %s", optarg);
451                                 return -EINVAL;
452                         }
453
454                         r = strv_extend(x, a);
455                         if (r < 0)
456                                 return log_oom();
457
458                         r = strv_extend(x, b);
459                         if (r < 0)
460                                 return log_oom();
461
462                         break;
463                 }
464
465                 case ARG_SETENV: {
466                         char **n;
467
468                         if (!env_assignment_is_valid(optarg)) {
469                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
470                                 return -EINVAL;
471                         }
472
473                         n = strv_env_set(arg_setenv, optarg);
474                         if (!n)
475                                 return log_oom();
476
477                         strv_free(arg_setenv);
478                         arg_setenv = n;
479                         break;
480                 }
481
482                 case 'q':
483                         arg_quiet = true;
484                         break;
485
486                 case ARG_SHARE_SYSTEM:
487                         arg_share_system = true;
488                         break;
489
490                 case ARG_REGISTER:
491                         r = parse_boolean(optarg);
492                         if (r < 0) {
493                                 log_error("Failed to parse --register= argument: %s", optarg);
494                                 return r;
495                         }
496
497                         arg_register = r;
498                         break;
499
500                 case ARG_KEEP_UNIT:
501                         arg_keep_unit = true;
502                         break;
503
504                 case ARG_PERSONALITY:
505
506                         arg_personality = personality_from_string(optarg);
507                         if (arg_personality == 0xffffffffLU) {
508                                 log_error("Unknown or unsupported personality '%s'.", optarg);
509                                 return -EINVAL;
510                         }
511
512                         break;
513
514                 case '?':
515                         return -EINVAL;
516
517                 default:
518                         assert_not_reached("Unhandled option");
519                 }
520         }
521
522         if (arg_share_system)
523                 arg_register = false;
524
525         if (arg_boot && arg_share_system) {
526                 log_error("--boot and --share-system may not be combined.");
527                 return -EINVAL;
528         }
529
530         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
531                 log_error("--keep-unit may not be used when invoked from a user session.");
532                 return -EINVAL;
533         }
534
535         if (arg_directory && arg_image) {
536                 log_error("--directory= and --image= may not be combined.");
537                 return -EINVAL;
538         }
539
540         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
541
542         return 1;
543 }
544
545 static int mount_all(const char *dest) {
546
547         typedef struct MountPoint {
548                 const char *what;
549                 const char *where;
550                 const char *type;
551                 const char *options;
552                 unsigned long flags;
553                 bool fatal;
554         } MountPoint;
555
556         static const MountPoint mount_table[] = {
557                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
558                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
559                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
560                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
561                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
562                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
563                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
564                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
565 #ifdef HAVE_SELINUX
566                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
567                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
568 #endif
569         };
570
571         unsigned k;
572         int r = 0;
573
574         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
575                 _cleanup_free_ char *where = NULL;
576 #ifdef HAVE_SELINUX
577                 _cleanup_free_ char *options = NULL;
578 #endif
579                 const char *o;
580                 int t;
581
582                 where = strjoin(dest, "/", mount_table[k].where, NULL);
583                 if (!where)
584                         return log_oom();
585
586                 t = path_is_mount_point(where, true);
587                 if (t < 0) {
588                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
589
590                         if (r == 0)
591                                 r = t;
592
593                         continue;
594                 }
595
596                 /* Skip this entry if it is not a remount. */
597                 if (mount_table[k].what && t > 0)
598                         continue;
599
600                 mkdir_p(where, 0755);
601
602 #ifdef HAVE_SELINUX
603                 if (arg_selinux_apifs_context &&
604                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
605                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
606                         if (!options)
607                                 return log_oom();
608
609                         o = options;
610                 } else
611 #endif
612                         o = mount_table[k].options;
613
614
615                 if (mount(mount_table[k].what,
616                           where,
617                           mount_table[k].type,
618                           mount_table[k].flags,
619                           o) < 0 &&
620                     mount_table[k].fatal) {
621
622                         log_error("mount(%s) failed: %m", where);
623
624                         if (r == 0)
625                                 r = -errno;
626                 }
627         }
628
629         return r;
630 }
631
632 static int mount_binds(const char *dest, char **l, unsigned long flags) {
633         char **x, **y;
634
635         STRV_FOREACH_PAIR(x, y, l) {
636                 char *where;
637                 struct stat source_st, dest_st;
638                 int r;
639
640                 if (stat(*x, &source_st) < 0) {
641                         log_error("Failed to stat %s: %m", *x);
642                         return -errno;
643                 }
644
645                 where = strappenda(dest, *y);
646                 r = stat(where, &dest_st);
647                 if (r == 0) {
648                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
649                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
650                                                 *x, where);
651                                 return -EINVAL;
652                         }
653                 } else if (errno == ENOENT) {
654                         r = mkdir_parents_label(where, 0755);
655                         if (r < 0) {
656                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
657                                 return r;
658                         }
659                 } else {
660                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
661                         return -errno;
662                 }
663                 /* Create the mount point, but be conservative -- refuse to create block
664                 * and char devices. */
665                 if (S_ISDIR(source_st.st_mode))
666                         mkdir_label(where, 0755);
667                 else if (S_ISFIFO(source_st.st_mode))
668                         mkfifo(where, 0644);
669                 else if (S_ISSOCK(source_st.st_mode))
670                         mknod(where, 0644 | S_IFSOCK, 0);
671                 else if (S_ISREG(source_st.st_mode))
672                         touch(where);
673                 else {
674                         log_error("Refusing to create mountpoint for file: %s", *x);
675                         return -ENOTSUP;
676                 }
677
678                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
679                         log_error("mount(%s) failed: %m", where);
680                         return -errno;
681                 }
682
683                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
684                         log_error("mount(%s) failed: %m", where);
685                         return -errno;
686                 }
687         }
688
689         return 0;
690 }
691
692 static int setup_timezone(const char *dest) {
693         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
694         char *z, *y;
695         int r;
696
697         assert(dest);
698
699         /* Fix the timezone, if possible */
700         r = readlink_malloc("/etc/localtime", &p);
701         if (r < 0) {
702                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
703                 return 0;
704         }
705
706         z = path_startswith(p, "../usr/share/zoneinfo/");
707         if (!z)
708                 z = path_startswith(p, "/usr/share/zoneinfo/");
709         if (!z) {
710                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
711                 return 0;
712         }
713
714         where = strappend(dest, "/etc/localtime");
715         if (!where)
716                 return log_oom();
717
718         r = readlink_malloc(where, &q);
719         if (r >= 0) {
720                 y = path_startswith(q, "../usr/share/zoneinfo/");
721                 if (!y)
722                         y = path_startswith(q, "/usr/share/zoneinfo/");
723
724
725                 /* Already pointing to the right place? Then do nothing .. */
726                 if (y && streq(y, z))
727                         return 0;
728         }
729
730         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
731         if (!check)
732                 return log_oom();
733
734         if (access(check, F_OK) < 0) {
735                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
736                 return 0;
737         }
738
739         what = strappend("../usr/share/zoneinfo/", z);
740         if (!what)
741                 return log_oom();
742
743         unlink(where);
744         if (symlink(what, where) < 0) {
745                 log_error("Failed to correct timezone of container: %m");
746                 return 0;
747         }
748
749         return 0;
750 }
751
752 static int setup_resolv_conf(const char *dest) {
753         char _cleanup_free_ *where = NULL;
754
755         assert(dest);
756
757         if (arg_private_network)
758                 return 0;
759
760         /* Fix resolv.conf, if possible */
761         where = strappend(dest, "/etc/resolv.conf");
762         if (!where)
763                 return log_oom();
764
765         /* We don't really care for the results of this really. If it
766          * fails, it fails, but meh... */
767         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
768
769         return 0;
770 }
771
772 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
773
774         snprintf(s, 37,
775                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
776                  SD_ID128_FORMAT_VAL(id));
777
778         return s;
779 }
780
781 static int setup_boot_id(const char *dest) {
782         _cleanup_free_ char *from = NULL, *to = NULL;
783         sd_id128_t rnd = {};
784         char as_uuid[37];
785         int r;
786
787         assert(dest);
788
789         if (arg_share_system)
790                 return 0;
791
792         /* Generate a new randomized boot ID, so that each boot-up of
793          * the container gets a new one */
794
795         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
796         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
797         if (!from || !to)
798                 return log_oom();
799
800         r = sd_id128_randomize(&rnd);
801         if (r < 0) {
802                 log_error("Failed to generate random boot id: %s", strerror(-r));
803                 return r;
804         }
805
806         id128_format_as_uuid(rnd, as_uuid);
807
808         r = write_string_file(from, as_uuid);
809         if (r < 0) {
810                 log_error("Failed to write boot id: %s", strerror(-r));
811                 return r;
812         }
813
814         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
815                 log_error("Failed to bind mount boot id: %m");
816                 r = -errno;
817         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
818                 log_warning("Failed to make boot id read-only: %m");
819
820         unlink(from);
821         return r;
822 }
823
824 static int copy_devnodes(const char *dest) {
825
826         static const char devnodes[] =
827                 "null\0"
828                 "zero\0"
829                 "full\0"
830                 "random\0"
831                 "urandom\0"
832                 "tty\0";
833
834         const char *d;
835         int r = 0;
836         _cleanup_umask_ mode_t u;
837
838         assert(dest);
839
840         u = umask(0000);
841
842         NULSTR_FOREACH(d, devnodes) {
843                 _cleanup_free_ char *from = NULL, *to = NULL;
844                 struct stat st;
845
846                 from = strappend("/dev/", d);
847                 to = strjoin(dest, "/dev/", d, NULL);
848                 if (!from || !to)
849                         return log_oom();
850
851                 if (stat(from, &st) < 0) {
852
853                         if (errno != ENOENT) {
854                                 log_error("Failed to stat %s: %m", from);
855                                 return -errno;
856                         }
857
858                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
859
860                         log_error("%s is not a char or block device, cannot copy", from);
861                         return -EIO;
862
863                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
864
865                         log_error("mknod(%s) failed: %m", dest);
866                         return  -errno;
867                 }
868         }
869
870         return r;
871 }
872
873 static int setup_ptmx(const char *dest) {
874         _cleanup_free_ char *p = NULL;
875
876         p = strappend(dest, "/dev/ptmx");
877         if (!p)
878                 return log_oom();
879
880         if (symlink("pts/ptmx", p) < 0) {
881                 log_error("Failed to create /dev/ptmx symlink: %m");
882                 return -errno;
883         }
884
885         return 0;
886 }
887
888 static int setup_dev_console(const char *dest, const char *console) {
889         _cleanup_umask_ mode_t u;
890         const char *to;
891         struct stat st;
892         int r;
893
894         assert(dest);
895         assert(console);
896
897         u = umask(0000);
898
899         if (stat("/dev/null", &st) < 0) {
900                 log_error("Failed to stat /dev/null: %m");
901                 return -errno;
902         }
903
904         r = chmod_and_chown(console, 0600, 0, 0);
905         if (r < 0) {
906                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
907                 return r;
908         }
909
910         /* We need to bind mount the right tty to /dev/console since
911          * ptys can only exist on pts file systems. To have something
912          * to bind mount things on we create a device node first, and
913          * use /dev/null for that since we the cgroups device policy
914          * allows us to create that freely, while we cannot create
915          * /dev/console. (Note that the major minor doesn't actually
916          * matter here, since we mount it over anyway). */
917
918         to = strappenda(dest, "/dev/console");
919         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
920                 log_error("mknod() for /dev/console failed: %m");
921                 return -errno;
922         }
923
924         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
925                 log_error("Bind mount for /dev/console failed: %m");
926                 return -errno;
927         }
928
929         return 0;
930 }
931
932 static int setup_kmsg(const char *dest, int kmsg_socket) {
933         _cleanup_free_ char *from = NULL, *to = NULL;
934         int r, fd, k;
935         _cleanup_umask_ mode_t u;
936         union {
937                 struct cmsghdr cmsghdr;
938                 uint8_t buf[CMSG_SPACE(sizeof(int))];
939         } control = {};
940         struct msghdr mh = {
941                 .msg_control = &control,
942                 .msg_controllen = sizeof(control),
943         };
944         struct cmsghdr *cmsg;
945
946         assert(dest);
947         assert(kmsg_socket >= 0);
948
949         u = umask(0000);
950
951         /* We create the kmsg FIFO as /dev/kmsg, but immediately
952          * delete it after bind mounting it to /proc/kmsg. While FIFOs
953          * on the reading side behave very similar to /proc/kmsg,
954          * their writing side behaves differently from /dev/kmsg in
955          * that writing blocks when nothing is reading. In order to
956          * avoid any problems with containers deadlocking due to this
957          * we simply make /dev/kmsg unavailable to the container. */
958         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
959             asprintf(&to, "%s/proc/kmsg", dest) < 0)
960                 return log_oom();
961
962         if (mkfifo(from, 0600) < 0) {
963                 log_error("mkfifo() for /dev/kmsg failed: %m");
964                 return -errno;
965         }
966
967         r = chmod_and_chown(from, 0600, 0, 0);
968         if (r < 0) {
969                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
970                 return r;
971         }
972
973         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
974                 log_error("Bind mount for /proc/kmsg failed: %m");
975                 return -errno;
976         }
977
978         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
979         if (fd < 0) {
980                 log_error("Failed to open fifo: %m");
981                 return -errno;
982         }
983
984         cmsg = CMSG_FIRSTHDR(&mh);
985         cmsg->cmsg_level = SOL_SOCKET;
986         cmsg->cmsg_type = SCM_RIGHTS;
987         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
988         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
989
990         mh.msg_controllen = cmsg->cmsg_len;
991
992         /* Store away the fd in the socket, so that it stays open as
993          * long as we run the child */
994         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
995         safe_close(fd);
996
997         if (k < 0) {
998                 log_error("Failed to send FIFO fd: %m");
999                 return -errno;
1000         }
1001
1002         /* And now make the FIFO unavailable as /dev/kmsg... */
1003         unlink(from);
1004         return 0;
1005 }
1006
1007 static int setup_hostname(void) {
1008
1009         if (arg_share_system)
1010                 return 0;
1011
1012         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1013                 return -errno;
1014
1015         return 0;
1016 }
1017
1018 static int setup_journal(const char *directory) {
1019         sd_id128_t machine_id, this_id;
1020         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1021         char *id;
1022         int r;
1023
1024         p = strappend(directory, "/etc/machine-id");
1025         if (!p)
1026                 return log_oom();
1027
1028         r = read_one_line_file(p, &b);
1029         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1030                 return 0;
1031         else if (r < 0) {
1032                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1033                 return r;
1034         }
1035
1036         id = strstrip(b);
1037         if (isempty(id) && arg_link_journal == LINK_AUTO)
1038                 return 0;
1039
1040         /* Verify validity */
1041         r = sd_id128_from_string(id, &machine_id);
1042         if (r < 0) {
1043                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1044                 return r;
1045         }
1046
1047         r = sd_id128_get_machine(&this_id);
1048         if (r < 0) {
1049                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1050                 return r;
1051         }
1052
1053         if (sd_id128_equal(machine_id, this_id)) {
1054                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1055                          "Host and machine ids are equal (%s): refusing to link journals", id);
1056                 if (arg_link_journal == LINK_AUTO)
1057                         return 0;
1058                 return
1059                         -EEXIST;
1060         }
1061
1062         if (arg_link_journal == LINK_NO)
1063                 return 0;
1064
1065         free(p);
1066         p = strappend("/var/log/journal/", id);
1067         q = strjoin(directory, "/var/log/journal/", id, NULL);
1068         if (!p || !q)
1069                 return log_oom();
1070
1071         if (path_is_mount_point(p, false) > 0) {
1072                 if (arg_link_journal != LINK_AUTO) {
1073                         log_error("%s: already a mount point, refusing to use for journal", p);
1074                         return -EEXIST;
1075                 }
1076
1077                 return 0;
1078         }
1079
1080         if (path_is_mount_point(q, false) > 0) {
1081                 if (arg_link_journal != LINK_AUTO) {
1082                         log_error("%s: already a mount point, refusing to use for journal", q);
1083                         return -EEXIST;
1084                 }
1085
1086                 return 0;
1087         }
1088
1089         r = readlink_and_make_absolute(p, &d);
1090         if (r >= 0) {
1091                 if ((arg_link_journal == LINK_GUEST ||
1092                      arg_link_journal == LINK_AUTO) &&
1093                     path_equal(d, q)) {
1094
1095                         r = mkdir_p(q, 0755);
1096                         if (r < 0)
1097                                 log_warning("failed to create directory %s: %m", q);
1098                         return 0;
1099                 }
1100
1101                 if (unlink(p) < 0) {
1102                         log_error("Failed to remove symlink %s: %m", p);
1103                         return -errno;
1104                 }
1105         } else if (r == -EINVAL) {
1106
1107                 if (arg_link_journal == LINK_GUEST &&
1108                     rmdir(p) < 0) {
1109
1110                         if (errno == ENOTDIR) {
1111                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1112                                 return r;
1113                         } else {
1114                                 log_error("Failed to remove %s: %m", p);
1115                                 return -errno;
1116                         }
1117                 }
1118         } else if (r != -ENOENT) {
1119                 log_error("readlink(%s) failed: %m", p);
1120                 return r;
1121         }
1122
1123         if (arg_link_journal == LINK_GUEST) {
1124
1125                 if (symlink(q, p) < 0) {
1126                         log_error("Failed to symlink %s to %s: %m", q, p);
1127                         return -errno;
1128                 }
1129
1130                 r = mkdir_p(q, 0755);
1131                 if (r < 0)
1132                         log_warning("failed to create directory %s: %m", q);
1133                 return 0;
1134         }
1135
1136         if (arg_link_journal == LINK_HOST) {
1137                 r = mkdir_p(p, 0755);
1138                 if (r < 0) {
1139                         log_error("Failed to create %s: %m", p);
1140                         return r;
1141                 }
1142
1143         } else if (access(p, F_OK) < 0)
1144                 return 0;
1145
1146         r = mkdir_p(q, 0755);
1147         if (r < 0) {
1148                 log_error("Failed to create %s: %m", q);
1149                 return r;
1150         }
1151
1152         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1153                 log_error("Failed to bind mount journal from host into guest: %m");
1154                 return -errno;
1155         }
1156
1157         return 0;
1158 }
1159
1160 static int setup_kdbus(const char *dest, const char *path) {
1161         const char *p;
1162
1163         if (!path)
1164                 return 0;
1165
1166         p = strappenda(dest, "/dev/kdbus");
1167         if (mkdir(p, 0755) < 0) {
1168                 log_error("Failed to create kdbus path: %m");
1169                 return  -errno;
1170         }
1171
1172         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1173                 log_error("Failed to mount kdbus domain path: %m");
1174                 return -errno;
1175         }
1176
1177         return 0;
1178 }
1179
1180 static int drop_capabilities(void) {
1181         return capability_bounding_set_drop(~arg_retain, false);
1182 }
1183
1184 static int register_machine(pid_t pid) {
1185         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1186         _cleanup_bus_unref_ sd_bus *bus = NULL;
1187         int r;
1188
1189         if (!arg_register)
1190                 return 0;
1191
1192         r = sd_bus_default_system(&bus);
1193         if (r < 0) {
1194                 log_error("Failed to open system bus: %s", strerror(-r));
1195                 return r;
1196         }
1197
1198         if (arg_keep_unit) {
1199                 r = sd_bus_call_method(
1200                                 bus,
1201                                 "org.freedesktop.machine1",
1202                                 "/org/freedesktop/machine1",
1203                                 "org.freedesktop.machine1.Manager",
1204                                 "RegisterMachine",
1205                                 &error,
1206                                 NULL,
1207                                 "sayssus",
1208                                 arg_machine,
1209                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1210                                 "nspawn",
1211                                 "container",
1212                                 (uint32_t) pid,
1213                                 strempty(arg_directory));
1214         } else {
1215                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1216
1217                 r = sd_bus_message_new_method_call(
1218                                 bus,
1219                                 &m,
1220                                 "org.freedesktop.machine1",
1221                                 "/org/freedesktop/machine1",
1222                                 "org.freedesktop.machine1.Manager",
1223                                 "CreateMachine");
1224                 if (r < 0) {
1225                         log_error("Failed to create message: %s", strerror(-r));
1226                         return r;
1227                 }
1228
1229                 r = sd_bus_message_append(
1230                                 m,
1231                                 "sayssus",
1232                                 arg_machine,
1233                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1234                                 "nspawn",
1235                                 "container",
1236                                 (uint32_t) pid,
1237                                 strempty(arg_directory));
1238                 if (r < 0) {
1239                         log_error("Failed to append message arguments: %s", strerror(-r));
1240                         return r;
1241                 }
1242
1243                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1244                 if (r < 0) {
1245                         log_error("Failed to open container: %s", strerror(-r));
1246                         return r;
1247                 }
1248
1249                 if (!isempty(arg_slice)) {
1250                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1251                         if (r < 0) {
1252                                 log_error("Failed to append slice: %s", strerror(-r));
1253                                 return r;
1254                         }
1255                 }
1256
1257                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1258                 if (r < 0) {
1259                         log_error("Failed to add device policy: %s", strerror(-r));
1260                         return r;
1261                 }
1262
1263                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1264                                           /* Allow the container to
1265                                            * access and create the API
1266                                            * device nodes, so that
1267                                            * PrivateDevices= in the
1268                                            * container can work
1269                                            * fine */
1270                                           "/dev/null", "rwm",
1271                                           "/dev/zero", "rwm",
1272                                           "/dev/full", "rwm",
1273                                           "/dev/random", "rwm",
1274                                           "/dev/urandom", "rwm",
1275                                           "/dev/tty", "rwm",
1276                                           /* Allow the container
1277                                            * access to ptys. However,
1278                                            * do not permit the
1279                                            * container to ever create
1280                                            * these device nodes. */
1281                                           "/dev/pts/ptmx", "rw",
1282                                           "char-pts", "rw",
1283                                           /* Allow the container
1284                                            * access to all kdbus
1285                                            * devices. Again, the
1286                                            * container cannot create
1287                                            * these nodes, only use
1288                                            * them. We use a pretty
1289                                            * open match here, so that
1290                                            * the kernel API can still
1291                                            * change. */
1292                                           "char-kdbus", "rw",
1293                                           "char-kdbus/*", "rw");
1294                 if (r < 0) {
1295                         log_error("Failed to add device whitelist: %s", strerror(-r));
1296                         return r;
1297                 }
1298
1299                 r = sd_bus_message_close_container(m);
1300                 if (r < 0) {
1301                         log_error("Failed to close container: %s", strerror(-r));
1302                         return r;
1303                 }
1304
1305                 r = sd_bus_call(bus, m, 0, &error, NULL);
1306         }
1307
1308         if (r < 0) {
1309                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1310                 return r;
1311         }
1312
1313         return 0;
1314 }
1315
1316 static int terminate_machine(pid_t pid) {
1317         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1318         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1319         _cleanup_bus_unref_ sd_bus *bus = NULL;
1320         const char *path;
1321         int r;
1322
1323         if (!arg_register)
1324                 return 0;
1325
1326         r = sd_bus_default_system(&bus);
1327         if (r < 0) {
1328                 log_error("Failed to open system bus: %s", strerror(-r));
1329                 return r;
1330         }
1331
1332         r = sd_bus_call_method(
1333                         bus,
1334                         "org.freedesktop.machine1",
1335                         "/org/freedesktop/machine1",
1336                         "org.freedesktop.machine1.Manager",
1337                         "GetMachineByPID",
1338                         &error,
1339                         &reply,
1340                         "u",
1341                         (uint32_t) pid);
1342         if (r < 0) {
1343                 /* Note that the machine might already have been
1344                  * cleaned up automatically, hence don't consider it a
1345                  * failure if we cannot get the machine object. */
1346                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1347                 return 0;
1348         }
1349
1350         r = sd_bus_message_read(reply, "o", &path);
1351         if (r < 0)
1352                 return bus_log_parse_error(r);
1353
1354         r = sd_bus_call_method(
1355                         bus,
1356                         "org.freedesktop.machine1",
1357                         path,
1358                         "org.freedesktop.machine1.Machine",
1359                         "Terminate",
1360                         &error,
1361                         NULL,
1362                         NULL);
1363         if (r < 0) {
1364                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1365                 return 0;
1366         }
1367
1368         return 0;
1369 }
1370
1371 static int reset_audit_loginuid(void) {
1372         _cleanup_free_ char *p = NULL;
1373         int r;
1374
1375         if (arg_share_system)
1376                 return 0;
1377
1378         r = read_one_line_file("/proc/self/loginuid", &p);
1379         if (r == -ENOENT)
1380                 return 0;
1381         if (r < 0) {
1382                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1383                 return r;
1384         }
1385
1386         /* Already reset? */
1387         if (streq(p, "4294967295"))
1388                 return 0;
1389
1390         r = write_string_file("/proc/self/loginuid", "4294967295");
1391         if (r < 0) {
1392                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1393                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1394                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1395                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1396                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1397
1398                 sleep(5);
1399         }
1400
1401         return 0;
1402 }
1403
1404 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1405
1406 static int get_mac(struct ether_addr *mac) {
1407         int r;
1408
1409         uint8_t result[8];
1410         size_t l, sz;
1411         uint8_t *v;
1412
1413         l = strlen(arg_machine);
1414         sz = sizeof(sd_id128_t) + l;
1415         v = alloca(sz);
1416
1417         /* fetch some persistent data unique to the host */
1418         r = sd_id128_get_machine((sd_id128_t*) v);
1419         if (r < 0)
1420                 return r;
1421
1422         /* combine with some data unique (on this host) to this
1423          * container instance */
1424         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1425
1426         /* Let's hash the host machine ID plus the container name. We
1427          * use a fixed, but originally randomly created hash key here. */
1428         siphash24(result, v, sz, HASH_KEY.bytes);
1429
1430         assert_cc(ETH_ALEN <= sizeof(result));
1431         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1432
1433         /* see eth_random_addr in the kernel */
1434         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1435         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1436
1437         return 0;
1438 }
1439
1440 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1441         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1442         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1443         struct ether_addr mac;
1444         int r;
1445
1446         if (!arg_private_network)
1447                 return 0;
1448
1449         if (!arg_network_veth)
1450                 return 0;
1451
1452         /* Use two different interface name prefixes depending whether
1453          * we are in bridge mode or not. */
1454         if (arg_network_bridge)
1455                 memcpy(iface_name, "vb-", 3);
1456         else
1457                 memcpy(iface_name, "ve-", 3);
1458         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1459
1460         r = get_mac(&mac);
1461         if (r < 0) {
1462                 log_error("Failed to generate predictable MAC address for host0");
1463                 return r;
1464         }
1465
1466         r = sd_rtnl_open(&rtnl, 0);
1467         if (r < 0) {
1468                 log_error("Failed to connect to netlink: %s", strerror(-r));
1469                 return r;
1470         }
1471
1472         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1473         if (r < 0) {
1474                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1475                 return r;
1476         }
1477
1478         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1479         if (r < 0) {
1480                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1481                 return r;
1482         }
1483
1484         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1485         if (r < 0) {
1486                 log_error("Failed to open netlink container: %s", strerror(-r));
1487                 return r;
1488         }
1489
1490         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1491         if (r < 0) {
1492                 log_error("Failed to open netlink container: %s", strerror(-r));
1493                 return r;
1494         }
1495
1496         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1497         if (r < 0) {
1498                 log_error("Failed to open netlink container: %s", strerror(-r));
1499                 return r;
1500         }
1501
1502         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1503         if (r < 0) {
1504                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1505                 return r;
1506         }
1507
1508         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1509         if (r < 0) {
1510                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1511                 return r;
1512         }
1513
1514         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1515         if (r < 0) {
1516                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1517                 return r;
1518         }
1519
1520         r = sd_rtnl_message_close_container(m);
1521         if (r < 0) {
1522                 log_error("Failed to close netlink container: %s", strerror(-r));
1523                 return r;
1524         }
1525
1526         r = sd_rtnl_message_close_container(m);
1527         if (r < 0) {
1528                 log_error("Failed to close netlink container: %s", strerror(-r));
1529                 return r;
1530         }
1531
1532         r = sd_rtnl_message_close_container(m);
1533         if (r < 0) {
1534                 log_error("Failed to close netlink container: %s", strerror(-r));
1535                 return r;
1536         }
1537
1538         r = sd_rtnl_call(rtnl, m, 0, NULL);
1539         if (r < 0) {
1540                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1541                 return r;
1542         }
1543
1544         return 0;
1545 }
1546
1547 static int setup_bridge(const char veth_name[]) {
1548         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1549         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1550         int r, bridge;
1551
1552         if (!arg_private_network)
1553                 return 0;
1554
1555         if (!arg_network_veth)
1556                 return 0;
1557
1558         if (!arg_network_bridge)
1559                 return 0;
1560
1561         bridge = (int) if_nametoindex(arg_network_bridge);
1562         if (bridge <= 0) {
1563                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1564                 return -errno;
1565         }
1566
1567         r = sd_rtnl_open(&rtnl, 0);
1568         if (r < 0) {
1569                 log_error("Failed to connect to netlink: %s", strerror(-r));
1570                 return r;
1571         }
1572
1573         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1574         if (r < 0) {
1575                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1576                 return r;
1577         }
1578
1579         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1580         if (r < 0) {
1581                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1582                 return r;
1583         }
1584
1585         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1586         if (r < 0) {
1587                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1588                 return r;
1589         }
1590
1591         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1592         if (r < 0) {
1593                 log_error("Failed to add netlink master field: %s", strerror(-r));
1594                 return r;
1595         }
1596
1597         r = sd_rtnl_call(rtnl, m, 0, NULL);
1598         if (r < 0) {
1599                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1600                 return r;
1601         }
1602
1603         return 0;
1604 }
1605
1606 static int parse_interface(struct udev *udev, const char *name) {
1607         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1608         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1609         int ifi;
1610
1611         ifi = (int) if_nametoindex(name);
1612         if (ifi <= 0) {
1613                 log_error("Failed to resolve interface %s: %m", name);
1614                 return -errno;
1615         }
1616
1617         sprintf(ifi_str, "n%i", ifi);
1618         d = udev_device_new_from_device_id(udev, ifi_str);
1619         if (!d) {
1620                 log_error("Failed to get udev device for interface %s: %m", name);
1621                 return -errno;
1622         }
1623
1624         if (udev_device_get_is_initialized(d) <= 0) {
1625                 log_error("Network interface %s is not initialized yet.", name);
1626                 return -EBUSY;
1627         }
1628
1629         return ifi;
1630 }
1631
1632 static int move_network_interfaces(pid_t pid) {
1633         _cleanup_udev_unref_ struct udev *udev = NULL;
1634         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1635         char **i;
1636         int r;
1637
1638         if (!arg_private_network)
1639                 return 0;
1640
1641         if (strv_isempty(arg_network_interfaces))
1642                 return 0;
1643
1644         r = sd_rtnl_open(&rtnl, 0);
1645         if (r < 0) {
1646                 log_error("Failed to connect to netlink: %s", strerror(-r));
1647                 return r;
1648         }
1649
1650         udev = udev_new();
1651         if (!udev) {
1652                 log_error("Failed to connect to udev.");
1653                 return -ENOMEM;
1654         }
1655
1656         STRV_FOREACH(i, arg_network_interfaces) {
1657                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1658                 int ifi;
1659
1660                 ifi = parse_interface(udev, *i);
1661                 if (ifi < 0)
1662                         return ifi;
1663
1664                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1665                 if (r < 0) {
1666                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1667                         return r;
1668                 }
1669
1670                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1671                 if (r < 0) {
1672                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1673                         return r;
1674                 }
1675
1676                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1677                 if (r < 0) {
1678                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1679                         return r;
1680                 }
1681         }
1682
1683         return 0;
1684 }
1685
1686 static int setup_macvlan(pid_t pid) {
1687         _cleanup_udev_unref_ struct udev *udev = NULL;
1688         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1689         char **i;
1690         int r;
1691
1692         if (!arg_private_network)
1693                 return 0;
1694
1695         if (strv_isempty(arg_network_macvlan))
1696                 return 0;
1697
1698         r = sd_rtnl_open(&rtnl, 0);
1699         if (r < 0) {
1700                 log_error("Failed to connect to netlink: %s", strerror(-r));
1701                 return r;
1702         }
1703
1704         udev = udev_new();
1705         if (!udev) {
1706                 log_error("Failed to connect to udev.");
1707                 return -ENOMEM;
1708         }
1709
1710         STRV_FOREACH(i, arg_network_macvlan) {
1711                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1712                 _cleanup_free_ char *n = NULL;
1713                 int ifi;
1714
1715                 ifi = parse_interface(udev, *i);
1716                 if (ifi < 0)
1717                         return ifi;
1718
1719                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1720                 if (r < 0) {
1721                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1722                         return r;
1723                 }
1724
1725                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1726                 if (r < 0) {
1727                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1728                         return r;
1729                 }
1730
1731                 n = strappend("mv-", *i);
1732                 if (!n)
1733                         return log_oom();
1734
1735                 strshorten(n, IFNAMSIZ-1);
1736
1737                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1738                 if (r < 0) {
1739                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1740                         return r;
1741                 }
1742
1743                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1744                 if (r < 0) {
1745                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1746                         return r;
1747                 }
1748
1749                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1750                 if (r < 0) {
1751                         log_error("Failed to open netlink container: %s", strerror(-r));
1752                         return r;
1753                 }
1754
1755                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1756                 if (r < 0) {
1757                         log_error("Failed to open netlink container: %s", strerror(-r));
1758                         return r;
1759                 }
1760
1761                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1762                 if (r < 0) {
1763                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1764                         return r;
1765                 }
1766
1767                 r = sd_rtnl_message_close_container(m);
1768                 if (r < 0) {
1769                         log_error("Failed to close netlink container: %s", strerror(-r));
1770                         return r;
1771                 }
1772
1773                 r = sd_rtnl_message_close_container(m);
1774                 if (r < 0) {
1775                         log_error("Failed to close netlink container: %s", strerror(-r));
1776                         return r;
1777                 }
1778
1779                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1780                 if (r < 0) {
1781                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1782                         return r;
1783                 }
1784         }
1785
1786         return 0;
1787 }
1788
/* Installs a seccomp filter that makes socket(AF_NETLINK, *, NETLINK_AUDIT)
 * fail with EAFNOSUPPORT while allowing everything else. Returns 0 on
 * success (or when built without seccomp support) and a negative errno-style
 * value on failure. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx filter;
        int r;

        /*
           The audit subsystem does not work inside containers, and
           most of the userspace audit hookup breaks when run in one.
           Rather than trying to fix that we simply prevent audit
           sockets from being created at all.

           Audit userspace interprets EAFNOSUPPORT from
           socket(AF_NETLINK, *, NETLINK_AUDIT) as "audit is disabled
           in the kernel", which is exactly what we want it to think.
         */

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        r = seccomp_add_secondary_archs(filter);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto out;
        }

        r = seccomp_rule_add(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto out;
        }

        /* Don't let libseccomp turn on NO_NEW_PRIVS when loading the filter */
        r = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto out;
        }

        r = seccomp_load(filter);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

out:
        /* The context is released on success and failure alike */
        seccomp_release(filter);
        return r;
#else
        return 0;
#endif

}
1845
1846 static int setup_image(char **device_path, int *loop_nr) {
1847         struct loop_info64 info = {
1848                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1849         };
1850         _cleanup_close_ int fd = -1, control = -1, loop = -1;
1851         _cleanup_free_ char* loopdev = NULL;
1852         struct stat st;
1853         int r, nr;
1854
1855         assert(device_path);
1856         assert(loop_nr);
1857
1858         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1859         if (fd < 0) {
1860                 log_error("Failed to open %s: %m", arg_image);
1861                 return -errno;
1862         }
1863
1864         if (fstat(fd, &st) < 0) {
1865                 log_error("Failed to stat %s: %m", arg_image);
1866                 return -errno;
1867         }
1868
1869         if (S_ISBLK(st.st_mode)) {
1870                 char *p;
1871
1872                 p = strdup(arg_image);
1873                 if (!p)
1874                         return log_oom();
1875
1876                 *device_path = p;
1877
1878                 *loop_nr = -1;
1879
1880                 r = fd;
1881                 fd = -1;
1882
1883                 return r;
1884         }
1885
1886         if (!S_ISREG(st.st_mode)) {
1887                 log_error("%s is not a regular file or block device: %m", arg_image);
1888                 return -EINVAL;
1889         }
1890
1891         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1892         if (control < 0) {
1893                 log_error("Failed to open /dev/loop-control: %m");
1894                 return -errno;
1895         }
1896
1897         nr = ioctl(control, LOOP_CTL_GET_FREE);
1898         if (nr < 0) {
1899                 log_error("Failed to allocate loop device: %m");
1900                 return -errno;
1901         }
1902
1903         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1904                 return log_oom();
1905
1906         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1907         if (loop < 0) {
1908                 log_error("Failed to open loop device %s: %m", loopdev);
1909                 return -errno;
1910         }
1911
1912         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1913                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1914                 return -errno;
1915         }
1916
1917         if (arg_read_only)
1918                 info.lo_flags |= LO_FLAGS_READ_ONLY;
1919
1920         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1921                 log_error("Failed to set loopback settings on %s: %m", loopdev);
1922                 return -errno;
1923         }
1924
1925         *device_path = loopdev;
1926         loopdev = NULL;
1927
1928         *loop_nr = nr;
1929
1930         r = loop;
1931         loop = -1;
1932
1933         return r;
1934 }
1935
#ifdef HAVE_BLKID
/* Helper for dissect_image(): records the device node 'node' of partition
 * number 'nr' in *device (with its number in *device_nr and its writability
 * in *device_rw) if nothing was recorded there yet or if 'nr' is lower than
 * the recorded partition number. Returns 0 if the candidate was taken or
 * ignored, and the result of log_oom() if the node path cannot be copied. */
static int dissect_pick_partition(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, bool rw) {

        char *copy;

        assert(device);
        assert(device_nr);
        assert(device_rw);
        assert(node);

        /* A partition with a lower (or equal) number was already picked */
        if (*device && nr >= *device_nr)
                return 0;

        copy = strdup(node);
        if (!copy)
                return log_oom();

        free(*device);
        *device = copy;
        *device_nr = nr;
        *device_rw = rw;

        return 0;
}
#endif

/* Inspects the GPT partition table of the block device open as 'fd' and
 * selects the partitions to use as root, home and srv, based on their
 * partition type UUIDs (GPT_ROOT_NATIVE, GPT_ROOT_SECONDARY, GPT_HOME,
 * GPT_SRV).
 *
 * Partitions carrying GPT_FLAG_NO_AUTO are ignored. If several partitions of
 * the same type exist, the one with the lowest partition number is used. A
 * native root partition is preferred over a secondary one; *secondary tells
 * which kind was chosen. A partition is reported writable unless
 * GPT_FLAG_READ_ONLY is set on it.
 *
 * On success returns 0. *root_device and *root_device_rw are always set then;
 * *home_device/*home_device_rw and *srv_device/*srv_device_rw are only
 * written if a matching partition was found, so callers should initialize
 * them beforehand. The returned device node strings are newly allocated and
 * owned by the caller. On failure a negative errno-style value is returned
 * and no output parameter is written. Without blkid support the function
 * always fails with -ENOTSUP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(root_device_rw);
        assert(home_device);
        assert(home_device_rw);
        assert(srv_device);
        assert(srv_device_rw);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* Reset errno first: if the call fails without setting it we treat that as OOM */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected */
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        /* pttype stays NULL if the lookup fails, which streq_ptr() treats as a mismatch */
        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s: %m", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate all udev devices that are children of the whole-disk device */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                bool rw;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number ... */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* ... and the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                /* Map the udev device back to its partition table entry */
                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                rw = !(flags & GPT_FLAG_READ_ONLY);

                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_pick_partition(&home, &home_nr, &home_rw, node, nr, rw);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_pick_partition(&srv, &srv_nr, &srv_rw, node, nr, rw);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_pick_partition(&root, &root_nr, &root_rw, node, nr, rw);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_pick_partition(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, rw);
#endif
                else
                        r = 0; /* A partition type we are not interested in */

                if (r < 0)
                        return r;
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* From here on nothing can fail anymore: hand the results over to the caller */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2185
/* Detects the file system type of block device 'what' with libblkid and
 * mounts it on 'where', or on 'where' with 'directory' appended if
 * 'directory' is not NULL. The mount is always done with MS_NODEV; it is
 * read-only if 'rw' is false or if arg_read_only is set. LUKS volumes are
 * refused with -ENOTSUP. Returns 0 on success and a negative errno-style
 * value on failure. Without blkid support the function always fails with
 * -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        /* A read-only container overrides whatever the partition flags say */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* Reset errno first: if the call fails without setting it we treat that as OOM */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. (-1 is a genuine
                 * probing error and is handled by the branch below.) */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2251
/* Mounts the given partitions below 'where': the root device on 'where'
 * itself, the home device on "/home" and the srv device on "/srv" beneath
 * it. Devices passed as NULL are skipped. Each *_rw flag is passed through to
 * mount_device() as the requested writability. Returns 0 on success and the
 * negative errno-style value of the first mount that failed otherwise. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Root goes first so that the other mounts end up inside of it */
        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2287
2288 static void loop_remove(int nr, int *image_fd) {
2289         _cleanup_close_ int control = -1;
2290
2291         if (nr < 0)
2292                 return;
2293
2294         if (image_fd && *image_fd >= 0) {
2295                 ioctl(*image_fd, LOOP_CLR_FD);
2296                 *image_fd = safe_close(*image_fd);
2297         }
2298
2299         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2300         if (control < 0)
2301                 return;
2302
2303         ioctl(control, LOOP_CTL_REMOVE, nr);
2304 }
2305
2306 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2307         int pipe_fds[2];
2308         pid_t pid;
2309
2310         assert(database);
2311         assert(key);
2312         assert(rpid);
2313
2314         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2315                 log_error("Failed to allocate pipe: %m");
2316                 return -errno;
2317         }
2318
2319         pid = fork();
2320         if (pid < 0) {
2321                 log_error("Failed to fork getent child: %m");
2322                 return -errno;
2323         } else if (pid == 0) {
2324                 int nullfd;
2325                 char *empty_env = NULL;
2326
2327                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2328                         _exit(EXIT_FAILURE);
2329
2330                 if (pipe_fds[0] > 2)
2331                         safe_close(pipe_fds[0]);
2332                 if (pipe_fds[1] > 2)
2333                         safe_close(pipe_fds[1]);
2334
2335                 nullfd = open("/dev/null", O_RDWR);
2336                 if (nullfd < 0)
2337                         _exit(EXIT_FAILURE);
2338
2339                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2340                         _exit(EXIT_FAILURE);
2341
2342                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2343                         _exit(EXIT_FAILURE);
2344
2345                 if (nullfd > 2)
2346                         safe_close(nullfd);
2347
2348                 reset_all_signal_handlers();
2349                 close_all_fds(NULL, 0);
2350
2351                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2352                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2353                 _exit(EXIT_FAILURE);
2354         }
2355
2356         pipe_fds[1] = safe_close(pipe_fds[1]);
2357
2358         *rpid = pid;
2359
2360         return pipe_fds[0];
2361 }
2362
2363 static int change_uid_gid(char **_home) {
2364         char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2365         _cleanup_free_ uid_t *uids = NULL;
2366         _cleanup_free_ char *home = NULL;
2367         _cleanup_fclose_ FILE *f = NULL;
2368         _cleanup_close_ int fd = -1;
2369         unsigned n_uids = 0;
2370         size_t sz = 0, l;
2371         uid_t uid;
2372         gid_t gid;
2373         pid_t pid;
2374         int r;
2375
2376         assert(_home);
2377
2378         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2379                 /* Reset everything fully to 0, just in case */
2380
2381                 if (setgroups(0, NULL) < 0) {
2382                         log_error("setgroups() failed: %m");
2383                         return -errno;
2384                 }
2385
2386                 if (setresgid(0, 0, 0) < 0) {
2387                         log_error("setregid() failed: %m");
2388                         return -errno;
2389                 }
2390
2391                 if (setresuid(0, 0, 0) < 0) {
2392                         log_error("setreuid() failed: %m");
2393                         return -errno;
2394                 }
2395
2396                 *_home = NULL;
2397                 return 0;
2398         }
2399
2400         /* First, get user credentials */
2401         fd = spawn_getent("passwd", arg_user, &pid);
2402         if (fd < 0)
2403                 return fd;
2404
2405         f = fdopen(fd, "r");
2406         if (!f)
2407                 return log_oom();
2408         fd = -1;
2409
2410         if (!fgets(line, sizeof(line), f)) {
2411
2412                 if (!ferror(f)) {
2413                         log_error("Failed to resolve user %s.", arg_user);
2414                         return -ESRCH;
2415                 }
2416
2417                 log_error("Failed to read from getent: %m");
2418                 return -errno;
2419         }
2420
2421         truncate_nl(line);
2422
2423         wait_for_terminate_and_warn("getent passwd", pid);
2424
2425         x = strchr(line, ':');
2426         if (!x) {
2427                 log_error("/etc/passwd entry has invalid user field.");
2428                 return -EIO;
2429         }
2430
2431         u = strchr(x+1, ':');
2432         if (!u) {
2433                 log_error("/etc/passwd entry has invalid password field.");
2434                 return -EIO;
2435         }
2436
2437         u++;
2438         g = strchr(u, ':');
2439         if (!g) {
2440                 log_error("/etc/passwd entry has invalid UID field.");
2441                 return -EIO;
2442         }
2443
2444         *g = 0;
2445         g++;
2446         x = strchr(g, ':');
2447         if (!x) {
2448                 log_error("/etc/passwd entry has invalid GID field.");
2449                 return -EIO;
2450         }
2451
2452         *x = 0;
2453         h = strchr(x+1, ':');
2454         if (!h) {
2455                 log_error("/etc/passwd entry has invalid GECOS field.");
2456                 return -EIO;
2457         }
2458
2459         h++;
2460         x = strchr(h, ':');
2461         if (!x) {
2462                 log_error("/etc/passwd entry has invalid home directory field.");
2463                 return -EIO;
2464         }
2465
2466         *x = 0;
2467
2468         r = parse_uid(u, &uid);
2469         if (r < 0) {
2470                 log_error("Failed to parse UID of user.");
2471                 return -EIO;
2472         }
2473
2474         r = parse_gid(g, &gid);
2475         if (r < 0) {
2476                 log_error("Failed to parse GID of user.");
2477                 return -EIO;
2478         }
2479
2480         home = strdup(h);
2481         if (!home)
2482                 return log_oom();
2483
2484         /* Second, get group memberships */
2485         fd = spawn_getent("initgroups", arg_user, &pid);
2486         if (fd < 0)
2487                 return fd;
2488
2489         fclose(f);
2490         f = fdopen(fd, "r");
2491         if (!f)
2492                 return log_oom();
2493         fd = -1;
2494
2495         if (!fgets(line, sizeof(line), f)) {
2496                 if (!ferror(f)) {
2497                         log_error("Failed to resolve user %s.", arg_user);
2498                         return -ESRCH;
2499                 }
2500
2501                 log_error("Failed to read from getent: %m");
2502                 return -errno;
2503         }
2504
2505         truncate_nl(line);
2506
2507         wait_for_terminate_and_warn("getent initgroups", pid);
2508
2509         /* Skip over the username and subsequent separator whitespace */
2510         x = line;
2511         x += strcspn(x, WHITESPACE);
2512         x += strspn(x, WHITESPACE);
2513
2514         FOREACH_WORD(w, l, x, state) {
2515                 char c[l+1];
2516
2517                 memcpy(c, w, l);
2518                 c[l] = 0;
2519
2520                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2521                         return log_oom();
2522
2523                 r = parse_uid(c, &uids[n_uids++]);
2524                 if (r < 0) {
2525                         log_error("Failed to parse group data from getent.");
2526                         return -EIO;
2527                 }
2528         }
2529
2530         r = mkdir_parents(home, 0775);
2531         if (r < 0) {
2532                 log_error("Failed to make home root directory: %s", strerror(-r));
2533                 return r;
2534         }
2535
2536         r = mkdir_safe(home, 0755, uid, gid);
2537         if (r < 0 && r != -EEXIST) {
2538                 log_error("Failed to make home directory: %s", strerror(-r));
2539                 return r;
2540         }
2541
2542         fchown(STDIN_FILENO, uid, gid);
2543         fchown(STDOUT_FILENO, uid, gid);
2544         fchown(STDERR_FILENO, uid, gid);
2545
2546         if (setgroups(n_uids, uids) < 0) {
2547                 log_error("Failed to set auxiliary groups: %m");
2548                 return -errno;
2549         }
2550
2551         if (setresgid(gid, gid, gid) < 0) {
2552                 log_error("setregid() failed: %m");
2553                 return -errno;
2554         }
2555
2556         if (setresuid(uid, uid, uid) < 0) {
2557                 log_error("setreuid() failed: %m");
2558                 return -errno;
2559         }
2560
2561         if (_home) {
2562                 *_home = home;
2563                 home = NULL;
2564         }
2565
2566         return 0;
2567 }
2568
2569 int main(int argc, char *argv[]) {
2570
2571         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2572         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2573         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2574         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2575         _cleanup_fdset_free_ FDSet *fds = NULL;
2576         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2577         const char *console = NULL;
2578         char veth_name[IFNAMSIZ];
2579         bool secondary = false;
2580         pid_t pid = 0;
2581         sigset_t mask;
2582
2583         log_parse_environment();
2584         log_open();
2585
2586         k = parse_argv(argc, argv);
2587         if (k < 0)
2588                 goto finish;
2589         else if (k == 0) {
2590                 r = EXIT_SUCCESS;
2591                 goto finish;
2592         }
2593
2594         if (!arg_image) {
2595                 if (arg_directory) {
2596                         char *p;
2597
2598                         p = path_make_absolute_cwd(arg_directory);
2599                         free(arg_directory);
2600                         arg_directory = p;
2601                 } else
2602                         arg_directory = get_current_dir_name();
2603
2604                 if (!arg_directory) {
2605                         log_error("Failed to determine path, please use -D.");
2606                         goto finish;
2607                 }
2608                 path_kill_slashes(arg_directory);
2609         }
2610
2611         if (!arg_machine) {
2612                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2613                 if (!arg_machine) {
2614                         log_oom();
2615                         goto finish;
2616                 }
2617
2618                 hostname_cleanup(arg_machine, false);
2619                 if (isempty(arg_machine)) {
2620                         log_error("Failed to determine machine name automatically, please use -M.");
2621                         goto finish;
2622                 }
2623         }
2624
2625         if (geteuid() != 0) {
2626                 log_error("Need to be root.");
2627                 goto finish;
2628         }
2629
2630         if (sd_booted() <= 0) {
2631                 log_error("Not running on a systemd system.");
2632                 goto finish;
2633         }
2634
2635         log_close();
2636         n_fd_passed = sd_listen_fds(false);
2637         if (n_fd_passed > 0) {
2638                 k = fdset_new_listen_fds(&fds, false);
2639                 if (k < 0) {
2640                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2641                         goto finish;
2642                 }
2643         }
2644         fdset_close_others(fds);
2645         log_open();
2646
2647         if (arg_directory) {
2648                 if (path_equal(arg_directory, "/")) {
2649                         log_error("Spawning container on root directory not supported.");
2650                         goto finish;
2651                 }
2652
2653                 if (arg_boot) {
2654                         if (path_is_os_tree(arg_directory) <= 0) {
2655                                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2656                                 goto finish;
2657                         }
2658                 } else {
2659                         const char *p;
2660
2661                         p = strappenda(arg_directory,
2662                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2663                         if (access(p, F_OK) < 0) {
2664                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2665                                 goto finish;
2666
2667                         }
2668                 }
2669         } else {
2670                 char template[] = "/tmp/nspawn-root-XXXXXX";
2671
2672                 if (!mkdtemp(template)) {
2673                         log_error("Failed to create temporary directory: %m");
2674                         r = -errno;
2675                         goto finish;
2676                 }
2677
2678                 arg_directory = strdup(template);
2679                 if (!arg_directory) {
2680                         r = log_oom();
2681                         goto finish;
2682                 }
2683
2684                 image_fd = setup_image(&device_path, &loop_nr);
2685                 if (image_fd < 0) {
2686                         r = image_fd;
2687                         goto finish;
2688                 }
2689
2690                 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2691                 if (r < 0)
2692                         goto finish;
2693         }
2694
2695         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2696         if (master < 0) {
2697                 log_error("Failed to acquire pseudo tty: %m");
2698                 goto finish;
2699         }
2700
2701         console = ptsname(master);
2702         if (!console) {
2703                 log_error("Failed to determine tty name: %m");
2704                 goto finish;
2705         }
2706
2707         if (!arg_quiet)
2708                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
2709
2710         if (unlockpt(master) < 0) {
2711                 log_error("Failed to unlock tty: %m");
2712                 goto finish;
2713         }
2714
2715         if (access("/dev/kdbus/control", F_OK) >= 0) {
2716
2717                 if (arg_share_system) {
2718                         kdbus_domain = strdup("/dev/kdbus");
2719                         if (!kdbus_domain) {
2720                                 log_oom();
2721                                 goto finish;
2722                         }
2723                 } else {
2724                         const char *ns;
2725
2726                         ns = strappenda("machine-", arg_machine);
2727                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2728                         if (r < 0)
2729                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2730                         else
2731                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2732                 }
2733         }
2734
2735         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2736                 log_error("Failed to create kmsg socket pair: %m");
2737                 goto finish;
2738         }
2739
2740         sd_notify(0, "READY=1");
2741
2742         assert_se(sigemptyset(&mask) == 0);
2743         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2744         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2745
2746         for (;;) {
2747                 int parent_ready_fd = -1, child_ready_fd = -1;
2748                 siginfo_t status;
2749                 eventfd_t x;
2750
2751                 parent_ready_fd = eventfd(0, EFD_CLOEXEC);
2752                 if (parent_ready_fd < 0) {
2753                         log_error("Failed to create event fd: %m");
2754                         goto finish;
2755                 }
2756
2757                 child_ready_fd = eventfd(0, EFD_CLOEXEC);
2758                 if (child_ready_fd < 0) {
2759                         log_error("Failed to create event fd: %m");
2760                         goto finish;
2761                 }
2762
2763                 pid = syscall(__NR_clone,
2764                               SIGCHLD|CLONE_NEWNS|
2765                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2766                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
2767                 if (pid < 0) {
2768                         if (errno == EINVAL)
2769                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2770                         else
2771                                 log_error("clone() failed: %m");
2772
2773                         goto finish;
2774                 }
2775
2776                 if (pid == 0) {
2777                         /* child */
2778                         _cleanup_free_ char *home = NULL;
2779                         unsigned n_env = 2;
2780                         const char *envp[] = {
2781                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
2782                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2783                                 NULL, /* TERM */
2784                                 NULL, /* HOME */
2785                                 NULL, /* USER */
2786                                 NULL, /* LOGNAME */
2787                                 NULL, /* container_uuid */
2788                                 NULL, /* LISTEN_FDS */
2789                                 NULL, /* LISTEN_PID */
2790                                 NULL
2791                         };
2792                         char **env_use;
2793
2794                         envp[n_env] = strv_find_prefix(environ, "TERM=");
2795                         if (envp[n_env])
2796                                 n_env ++;
2797
2798                         master = safe_close(master);
2799
2800                         close_nointr(STDIN_FILENO);
2801                         close_nointr(STDOUT_FILENO);
2802                         close_nointr(STDERR_FILENO);
2803
2804                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2805
2806                         reset_all_signal_handlers();
2807
2808                         assert_se(sigemptyset(&mask) == 0);
2809                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2810
2811                         k = open_terminal(console, O_RDWR);
2812                         if (k != STDIN_FILENO) {
2813                                 if (k >= 0) {
2814                                         safe_close(k);
2815                                         k = -EINVAL;
2816                                 }
2817
2818                                 log_error("Failed to open console: %s", strerror(-k));
2819                                 goto child_fail;
2820                         }
2821
2822                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2823                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2824                                 log_error("Failed to duplicate console: %m");
2825                                 goto child_fail;
2826                         }
2827
2828                         if (setsid() < 0) {
2829                                 log_error("setsid() failed: %m");
2830                                 goto child_fail;
2831                         }
2832
2833                         if (reset_audit_loginuid() < 0)
2834                                 goto child_fail;
2835
2836                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2837                                 log_error("PR_SET_PDEATHSIG failed: %m");
2838                                 goto child_fail;
2839                         }
2840
2841                         /* Mark everything as slave, so that we still
2842                          * receive mounts from the real root, but don't
2843                          * propagate mounts to the real root. */
2844                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2845                                 log_error("MS_SLAVE|MS_REC failed: %m");
2846                                 goto child_fail;
2847                         }
2848
2849                         if (mount_devices(arg_directory,
2850                                           root_device, root_device_rw,
2851                                           home_device, home_device_rw,
2852                                           srv_device, srv_device_rw) < 0)
2853                                 goto child_fail;
2854
2855                         /* Turn directory into bind mount */
2856                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2857                                 log_error("Failed to make bind mount.");
2858                                 goto child_fail;
2859                         }
2860
2861                         if (arg_read_only)
2862                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2863                                         log_error("Failed to make read-only.");
2864                                         goto child_fail;
2865                                 }
2866
2867                         if (mount_all(arg_directory) < 0)
2868                                 goto child_fail;
2869
2870                         if (copy_devnodes(arg_directory) < 0)
2871                                 goto child_fail;
2872
2873                         if (setup_ptmx(arg_directory) < 0)
2874                                 goto child_fail;
2875
2876                         dev_setup(arg_directory);
2877
2878                         if (audit_still_doesnt_work_in_containers() < 0)
2879                                 goto child_fail;
2880
2881                         if (setup_dev_console(arg_directory, console) < 0)
2882                                 goto child_fail;
2883
2884                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2885                                 goto child_fail;
2886
2887                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
2888
2889                         if (setup_boot_id(arg_directory) < 0)
2890                                 goto child_fail;
2891
2892                         if (setup_timezone(arg_directory) < 0)
2893                                 goto child_fail;
2894
2895                         if (setup_resolv_conf(arg_directory) < 0)
2896                                 goto child_fail;
2897
2898                         if (setup_journal(arg_directory) < 0)
2899                                 goto child_fail;
2900
2901                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
2902                                 goto child_fail;
2903
2904                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2905                                 goto child_fail;
2906
2907                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
2908                                 goto child_fail;
2909
2910                         /* Tell the parent that we are ready, and that
2911                          * it can cgroupify us to that we lack access
2912                          * to certain devices and resources. */
2913                         eventfd_write(child_ready_fd, 1);
2914                         child_ready_fd = safe_close(child_ready_fd);
2915
2916                         if (chdir(arg_directory) < 0) {
2917                                 log_error("chdir(%s) failed: %m", arg_directory);
2918                                 goto child_fail;
2919                         }
2920
2921                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2922                                 log_error("mount(MS_MOVE) failed: %m");
2923                                 goto child_fail;
2924                         }
2925
2926                         if (chroot(".") < 0) {
2927                                 log_error("chroot() failed: %m");
2928                                 goto child_fail;
2929                         }
2930
2931                         if (chdir("/") < 0) {
2932                                 log_error("chdir() failed: %m");
2933                                 goto child_fail;
2934                         }
2935
2936                         umask(0022);
2937
2938                         if (arg_private_network)
2939                                 loopback_setup();
2940
2941                         if (drop_capabilities() < 0) {
2942                                 log_error("drop_capabilities() failed: %m");
2943                                 goto child_fail;
2944                         }
2945
2946                         r = change_uid_gid(&home);
2947                         if (r < 0)
2948                                 goto child_fail;
2949
2950                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2951                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2952                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
2953                                 log_oom();
2954                                 goto child_fail;
2955                         }
2956
2957                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2958                                 char as_uuid[37];
2959
2960                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
2961                                         log_oom();
2962                                         goto child_fail;
2963                                 }
2964                         }
2965
2966                         if (fdset_size(fds) > 0) {
2967                                 k = fdset_cloexec(fds, false);
2968                                 if (k < 0) {
2969                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
2970                                         goto child_fail;
2971                                 }
2972
2973                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
2974                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
2975                                         log_oom();
2976                                         goto child_fail;
2977                                 }
2978                         }
2979
2980                         setup_hostname();
2981
2982                         if (arg_personality != 0xffffffffLU) {
2983                                 if (personality(arg_personality) < 0) {
2984                                         log_error("personality() failed: %m");
2985                                         goto child_fail;
2986                                 }
2987                         } else if (secondary) {
2988                                 if (personality(PER_LINUX32) < 0) {
2989                                         log_error("personality() failed: %m");
2990                                         goto child_fail;
2991                                 }
2992                         }
2993
2994 #ifdef HAVE_SELINUX
2995                         if (arg_selinux_context)
2996                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
2997                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
2998                                         goto child_fail;
2999                                 }
3000 #endif
3001
3002                         if (!strv_isempty(arg_setenv)) {
3003                                 char **n;
3004
3005                                 n = strv_env_merge(2, envp, arg_setenv);
3006                                 if (!n) {
3007                                         log_oom();
3008                                         goto child_fail;
3009                                 }
3010
3011                                 env_use = n;
3012                         } else
3013                                 env_use = (char**) envp;
3014
3015                         /* Wait until the parent is ready with the setup, too... */
3016                         eventfd_read(parent_ready_fd, &x);
3017                         parent_ready_fd = safe_close(parent_ready_fd);
3018
3019                         if (arg_boot) {
3020                                 char **a;
3021                                 size_t l;
3022
3023                                 /* Automatically search for the init system */
3024
3025                                 l = 1 + argc - optind;
3026                                 a = newa(char*, l + 1);
3027                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3028
3029                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3030                                 execve(a[0], a, env_use);
3031
3032                                 a[0] = (char*) "/lib/systemd/systemd";
3033                                 execve(a[0], a, env_use);
3034
3035                                 a[0] = (char*) "/sbin/init";
3036                                 execve(a[0], a, env_use);
3037                         } else if (argc > optind)
3038                                 execvpe(argv[optind], argv + optind, env_use);
3039                         else {
3040                                 chdir(home ? home : "/root");
3041                                 execle("/bin/bash", "-bash", NULL, env_use);
3042                                 execle("/bin/sh", "-sh", NULL, env_use);
3043                         }
3044
3045                         log_error("execv() failed: %m");
3046
3047                 child_fail:
3048                         _exit(EXIT_FAILURE);
3049                 }
3050
3051                 fdset_free(fds);
3052                 fds = NULL;
3053
3054                 /* Wait until the child reported that it is ready with
3055                  * all it needs to do with privileges. After we got
3056                  * the notification we can make the process join its
3057                  * cgroup which might limit what it can do */
3058                 eventfd_read(child_ready_fd, &x);
3059
3060                 r = register_machine(pid);
3061                 if (r < 0)
3062                         goto finish;
3063
3064                 r = move_network_interfaces(pid);
3065                 if (r < 0)
3066                         goto finish;
3067
3068                 r = setup_veth(pid, veth_name);
3069                 if (r < 0)
3070                         goto finish;
3071
3072                 r = setup_bridge(veth_name);
3073                 if (r < 0)
3074                         goto finish;
3075
3076                 r = setup_macvlan(pid);
3077                 if (r < 0)
3078                         goto finish;
3079
3080                 /* Notify the child that the parent is ready with all
3081                  * its setup, and thtat the child can now hand over
3082                  * control to the code to run inside the container. */
3083                 eventfd_write(parent_ready_fd, 1);
3084
3085                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3086                 if (k < 0) {
3087                         r = EXIT_FAILURE;
3088                         break;
3089                 }
3090
3091                 if (!arg_quiet)
3092                         putc('\n', stdout);
3093
3094                 /* Kill if it is not dead yet anyway */
3095                 terminate_machine(pid);
3096
3097                 /* Redundant, but better safe than sorry */
3098                 kill(pid, SIGKILL);
3099
3100                 k = wait_for_terminate(pid, &status);
3101                 pid = 0;
3102
3103                 if (k < 0) {
3104                         r = EXIT_FAILURE;
3105                         break;
3106                 }
3107
3108                 if (status.si_code == CLD_EXITED) {
3109                         r = status.si_status;
3110                         if (status.si_status != 0) {
3111                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
3112                                 break;
3113                         }
3114
3115                         if (!arg_quiet)
3116                                 log_debug("Container %s exited successfully.", arg_machine);
3117                         break;
3118                 } else if (status.si_code == CLD_KILLED &&
3119                            status.si_status == SIGINT) {
3120
3121                         if (!arg_quiet)
3122                                 log_info("Container %s has been shut down.", arg_machine);
3123                         r = 0;
3124                         break;
3125                 } else if (status.si_code == CLD_KILLED &&
3126                            status.si_status == SIGHUP) {
3127
3128                         if (!arg_quiet)
3129                                 log_info("Container %s is being rebooted.", arg_machine);
3130                         continue;
3131                 } else if (status.si_code == CLD_KILLED ||
3132                            status.si_code == CLD_DUMPED) {
3133
3134                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3135                         r = EXIT_FAILURE;
3136                         break;
3137                 } else {
3138                         log_error("Container %s failed due to unknown reason.", arg_machine);
3139                         r = EXIT_FAILURE;
3140                         break;
3141                 }
3142         }
3143
3144 finish:
3145         loop_remove(loop_nr, &image_fd);
3146
3147         if (pid > 0)
3148                 kill(pid, SIGKILL);
3149
3150         free(arg_directory);
3151         free(arg_machine);
3152         free(arg_user);
3153         strv_free(arg_setenv);
3154         strv_free(arg_network_interfaces);
3155         strv_free(arg_network_macvlan);
3156         strv_free(arg_bind);
3157         strv_free(arg_bind_ro);
3158
3159         return r;
3160 }