chiark / gitweb /
73158a051831edf74a5db86ee8027b471a5e28c8
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90
91 #ifdef HAVE_SECCOMP
92 #include "seccomp-util.h"
93 #endif
94
/* Journal linking mode. Each value corresponds to one of the strings
 * accepted by --link-journal= ("no", "auto", "host", "guest"). */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto, the built-in default */
        LINK_HOST  = 2,  /* --link-journal=host */
        LINK_GUEST = 3   /* --link-journal=guest, also selected by -j */
} LinkJournal;
101
102 static char *arg_directory = NULL;
103 static char *arg_user = NULL;
104 static sd_id128_t arg_uuid = {};
105 static char *arg_machine = NULL;
106 static const char *arg_selinux_context = NULL;
107 static const char *arg_selinux_apifs_context = NULL;
108 static const char *arg_slice = NULL;
109 static bool arg_private_network = false;
110 static bool arg_read_only = false;
111 static bool arg_boot = false;
112 static LinkJournal arg_link_journal = LINK_AUTO;
113 static uint64_t arg_retain =
114         (1ULL << CAP_CHOWN) |
115         (1ULL << CAP_DAC_OVERRIDE) |
116         (1ULL << CAP_DAC_READ_SEARCH) |
117         (1ULL << CAP_FOWNER) |
118         (1ULL << CAP_FSETID) |
119         (1ULL << CAP_IPC_OWNER) |
120         (1ULL << CAP_KILL) |
121         (1ULL << CAP_LEASE) |
122         (1ULL << CAP_LINUX_IMMUTABLE) |
123         (1ULL << CAP_NET_BIND_SERVICE) |
124         (1ULL << CAP_NET_BROADCAST) |
125         (1ULL << CAP_NET_RAW) |
126         (1ULL << CAP_SETGID) |
127         (1ULL << CAP_SETFCAP) |
128         (1ULL << CAP_SETPCAP) |
129         (1ULL << CAP_SETUID) |
130         (1ULL << CAP_SYS_ADMIN) |
131         (1ULL << CAP_SYS_CHROOT) |
132         (1ULL << CAP_SYS_NICE) |
133         (1ULL << CAP_SYS_PTRACE) |
134         (1ULL << CAP_SYS_TTY_CONFIG) |
135         (1ULL << CAP_SYS_RESOURCE) |
136         (1ULL << CAP_SYS_BOOT) |
137         (1ULL << CAP_AUDIT_WRITE) |
138         (1ULL << CAP_AUDIT_CONTROL) |
139         (1ULL << CAP_MKNOD);
140 static char **arg_bind = NULL;
141 static char **arg_bind_ro = NULL;
142 static char **arg_setenv = NULL;
143 static bool arg_quiet = false;
144 static bool arg_share_system = false;
145 static bool arg_register = true;
146 static bool arg_keep_unit = false;
147 static char **arg_network_interfaces = NULL;
148 static char **arg_network_macvlan = NULL;
149 static bool arg_network_veth = false;
150 static const char *arg_network_bridge = NULL;
151 static unsigned long arg_personality = 0xffffffffLU;
152 static const char *arg_image = NULL;
153
154 static int help(void) {
155
156         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
157                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
158                "  -h --help                 Show this help\n"
159                "     --version              Print version string\n"
160                "  -q --quiet                Do not show status information\n"
161                "  -D --directory=PATH       Root directory for the container\n"
162                "  -i --image=PATH           File system device or image for the container\n"
163                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
164                "  -u --user=USER            Run the command under specified user or uid\n"
165                "  -M --machine=NAME         Set the machine name for the container\n"
166                "     --uuid=UUID            Set a specific machine UUID for the container\n"
167                "  -S --slice=SLICE          Place the container in the specified slice\n"
168                "     --private-network      Disable network in container\n"
169                "     --network-interface=INTERFACE\n"
170                "                            Assign an existing network interface to the\n"
171                "                            container\n"
172                "     --network-macvlan=INTERFACE\n"
173                "                            Create a macvlan network interface based on an\n"
174                "                            existing network interface to the container\n"
175                "     --network-veth         Add a virtual ethernet connection between host\n"
176                "                            and container\n"
177                "     --network-bridge=INTERFACE\n"
178                "                            Add a virtual ethernet connection between host\n"
179                "                            and container and add it to an existing bridge on\n"
180                "                            the host\n"
181                "  -Z --selinux-context=SECLABEL\n"
182                "                            Set the SELinux security context to be used by\n"
183                "                            processes in the container\n"
184                "  -L --selinux-apifs-context=SECLABEL\n"
185                "                            Set the SELinux security context to be used by\n"
186                "                            API/tmpfs file systems in the container\n"
187                "     --capability=CAP       In addition to the default, retain specified\n"
188                "                            capability\n"
189                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
190                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
191                "  -j                        Equivalent to --link-journal=host\n"
192                "     --read-only            Mount the root directory read-only\n"
193                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
194                "                            the container\n"
195                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
196                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
197                "     --share-system         Share system namespaces with host\n"
198                "     --register=BOOLEAN     Register container as machine\n"
199                "     --keep-unit            Do not register a scope for the machine, reuse\n"
200                "                            the service unit nspawn is running in\n",
201                program_invocation_short_name);
202
203         return 0;
204 }
205
206 static int parse_argv(int argc, char *argv[]) {
207
208         enum {
209                 ARG_VERSION = 0x100,
210                 ARG_PRIVATE_NETWORK,
211                 ARG_UUID,
212                 ARG_READ_ONLY,
213                 ARG_CAPABILITY,
214                 ARG_DROP_CAPABILITY,
215                 ARG_LINK_JOURNAL,
216                 ARG_BIND,
217                 ARG_BIND_RO,
218                 ARG_SETENV,
219                 ARG_SHARE_SYSTEM,
220                 ARG_REGISTER,
221                 ARG_KEEP_UNIT,
222                 ARG_NETWORK_INTERFACE,
223                 ARG_NETWORK_MACVLAN,
224                 ARG_NETWORK_VETH,
225                 ARG_NETWORK_BRIDGE,
226                 ARG_PERSONALITY,
227         };
228
229         static const struct option options[] = {
230                 { "help",                  no_argument,       NULL, 'h'                   },
231                 { "version",               no_argument,       NULL, ARG_VERSION           },
232                 { "directory",             required_argument, NULL, 'D'                   },
233                 { "user",                  required_argument, NULL, 'u'                   },
234                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
235                 { "boot",                  no_argument,       NULL, 'b'                   },
236                 { "uuid",                  required_argument, NULL, ARG_UUID              },
237                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
238                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
239                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
240                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
241                 { "bind",                  required_argument, NULL, ARG_BIND              },
242                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
243                 { "machine",               required_argument, NULL, 'M'                   },
244                 { "slice",                 required_argument, NULL, 'S'                   },
245                 { "setenv",                required_argument, NULL, ARG_SETENV            },
246                 { "selinux-context",       required_argument, NULL, 'Z'                   },
247                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
248                 { "quiet",                 no_argument,       NULL, 'q'                   },
249                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
250                 { "register",              required_argument, NULL, ARG_REGISTER          },
251                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
252                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
253                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
254                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
255                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
256                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
257                 { "image",                 required_argument, NULL, 'i'                   },
258                 {}
259         };
260
261         int c, r;
262         uint64_t plus = 0, minus = 0;
263
264         assert(argc >= 0);
265         assert(argv);
266
267         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
268
269                 switch (c) {
270
271                 case 'h':
272                         return help();
273
274                 case ARG_VERSION:
275                         puts(PACKAGE_STRING);
276                         puts(SYSTEMD_FEATURES);
277                         return 0;
278
279                 case 'D':
280                         free(arg_directory);
281                         arg_directory = canonicalize_file_name(optarg);
282                         if (!arg_directory) {
283                                 log_error("Invalid root directory: %m");
284                                 return -ENOMEM;
285                         }
286
287                         break;
288
289                 case 'i':
290                         arg_image = optarg;
291                         break;
292
293                 case 'u':
294                         free(arg_user);
295                         arg_user = strdup(optarg);
296                         if (!arg_user)
297                                 return log_oom();
298
299                         break;
300
301                 case ARG_NETWORK_BRIDGE:
302                         arg_network_bridge = optarg;
303
304                         /* fall through */
305
306                 case ARG_NETWORK_VETH:
307                         arg_network_veth = true;
308                         arg_private_network = true;
309                         break;
310
311                 case ARG_NETWORK_INTERFACE:
312                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
313                                 return log_oom();
314
315                         arg_private_network = true;
316                         break;
317
318                 case ARG_NETWORK_MACVLAN:
319                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
320                                 return log_oom();
321
322                         /* fall through */
323
324                 case ARG_PRIVATE_NETWORK:
325                         arg_private_network = true;
326                         break;
327
328                 case 'b':
329                         arg_boot = true;
330                         break;
331
332                 case ARG_UUID:
333                         r = sd_id128_from_string(optarg, &arg_uuid);
334                         if (r < 0) {
335                                 log_error("Invalid UUID: %s", optarg);
336                                 return r;
337                         }
338                         break;
339
340                 case 'S':
341                         arg_slice = optarg;
342                         break;
343
344                 case 'M':
345                         if (isempty(optarg)) {
346                                 free(arg_machine);
347                                 arg_machine = NULL;
348                         } else {
349
350                                 if (!hostname_is_valid(optarg)) {
351                                         log_error("Invalid machine name: %s", optarg);
352                                         return -EINVAL;
353                                 }
354
355                                 free(arg_machine);
356                                 arg_machine = strdup(optarg);
357                                 if (!arg_machine)
358                                         return log_oom();
359
360                                 break;
361                         }
362
363                 case 'Z':
364                         arg_selinux_context = optarg;
365                         break;
366
367                 case 'L':
368                         arg_selinux_apifs_context = optarg;
369                         break;
370
371                 case ARG_READ_ONLY:
372                         arg_read_only = true;
373                         break;
374
375                 case ARG_CAPABILITY:
376                 case ARG_DROP_CAPABILITY: {
377                         char *state, *word;
378                         size_t length;
379
380                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
381                                 _cleanup_free_ char *t;
382                                 cap_value_t cap;
383
384                                 t = strndup(word, length);
385                                 if (!t)
386                                         return log_oom();
387
388                                 if (streq(t, "all")) {
389                                         if (c == ARG_CAPABILITY)
390                                                 plus = (uint64_t) -1;
391                                         else
392                                                 minus = (uint64_t) -1;
393                                 } else {
394                                         if (cap_from_name(t, &cap) < 0) {
395                                                 log_error("Failed to parse capability %s.", t);
396                                                 return -EINVAL;
397                                         }
398
399                                         if (c == ARG_CAPABILITY)
400                                                 plus |= 1ULL << (uint64_t) cap;
401                                         else
402                                                 minus |= 1ULL << (uint64_t) cap;
403                                 }
404                         }
405
406                         break;
407                 }
408
409                 case 'j':
410                         arg_link_journal = LINK_GUEST;
411                         break;
412
413                 case ARG_LINK_JOURNAL:
414                         if (streq(optarg, "auto"))
415                                 arg_link_journal = LINK_AUTO;
416                         else if (streq(optarg, "no"))
417                                 arg_link_journal = LINK_NO;
418                         else if (streq(optarg, "guest"))
419                                 arg_link_journal = LINK_GUEST;
420                         else if (streq(optarg, "host"))
421                                 arg_link_journal = LINK_HOST;
422                         else {
423                                 log_error("Failed to parse link journal mode %s", optarg);
424                                 return -EINVAL;
425                         }
426
427                         break;
428
429                 case ARG_BIND:
430                 case ARG_BIND_RO: {
431                         _cleanup_free_ char *a = NULL, *b = NULL;
432                         char *e;
433                         char ***x;
434
435                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
436
437                         e = strchr(optarg, ':');
438                         if (e) {
439                                 a = strndup(optarg, e - optarg);
440                                 b = strdup(e + 1);
441                         } else {
442                                 a = strdup(optarg);
443                                 b = strdup(optarg);
444                         }
445
446                         if (!a || !b)
447                                 return log_oom();
448
449                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
450                                 log_error("Invalid bind mount specification: %s", optarg);
451                                 return -EINVAL;
452                         }
453
454                         r = strv_extend(x, a);
455                         if (r < 0)
456                                 return log_oom();
457
458                         r = strv_extend(x, b);
459                         if (r < 0)
460                                 return log_oom();
461
462                         break;
463                 }
464
465                 case ARG_SETENV: {
466                         char **n;
467
468                         if (!env_assignment_is_valid(optarg)) {
469                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
470                                 return -EINVAL;
471                         }
472
473                         n = strv_env_set(arg_setenv, optarg);
474                         if (!n)
475                                 return log_oom();
476
477                         strv_free(arg_setenv);
478                         arg_setenv = n;
479                         break;
480                 }
481
482                 case 'q':
483                         arg_quiet = true;
484                         break;
485
486                 case ARG_SHARE_SYSTEM:
487                         arg_share_system = true;
488                         break;
489
490                 case ARG_REGISTER:
491                         r = parse_boolean(optarg);
492                         if (r < 0) {
493                                 log_error("Failed to parse --register= argument: %s", optarg);
494                                 return r;
495                         }
496
497                         arg_register = r;
498                         break;
499
500                 case ARG_KEEP_UNIT:
501                         arg_keep_unit = true;
502                         break;
503
504                 case ARG_PERSONALITY:
505
506                         arg_personality = personality_from_string(optarg);
507                         if (arg_personality == 0xffffffffLU) {
508                                 log_error("Unknown or unsupported personality '%s'.", optarg);
509                                 return -EINVAL;
510                         }
511
512                         break;
513
514                 case '?':
515                         return -EINVAL;
516
517                 default:
518                         assert_not_reached("Unhandled option");
519                 }
520         }
521
522         if (arg_share_system)
523                 arg_register = false;
524
525         if (arg_boot && arg_share_system) {
526                 log_error("--boot and --share-system may not be combined.");
527                 return -EINVAL;
528         }
529
530         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
531                 log_error("--keep-unit may not be used when invoked from a user session.");
532                 return -EINVAL;
533         }
534
535         if (arg_directory && arg_image) {
536                 log_error("--directory= and --image= may not be combined.");
537                 return -EINVAL;
538         }
539
540         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
541
542         return 1;
543 }
544
545 static int mount_all(const char *dest) {
546
547         typedef struct MountPoint {
548                 const char *what;
549                 const char *where;
550                 const char *type;
551                 const char *options;
552                 unsigned long flags;
553                 bool fatal;
554         } MountPoint;
555
556         static const MountPoint mount_table[] = {
557                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
558                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
559                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
560                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
561                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
562                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
563                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
564                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
565 #ifdef HAVE_SELINUX
566                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
567                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
568 #endif
569         };
570
571         unsigned k;
572         int r = 0;
573
574         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
575                 _cleanup_free_ char *where = NULL;
576 #ifdef HAVE_SELINUX
577                 _cleanup_free_ char *options = NULL;
578 #endif
579                 const char *o;
580                 int t;
581
582                 where = strjoin(dest, "/", mount_table[k].where, NULL);
583                 if (!where)
584                         return log_oom();
585
586                 t = path_is_mount_point(where, true);
587                 if (t < 0) {
588                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
589
590                         if (r == 0)
591                                 r = t;
592
593                         continue;
594                 }
595
596                 /* Skip this entry if it is not a remount. */
597                 if (mount_table[k].what && t > 0)
598                         continue;
599
600                 mkdir_p(where, 0755);
601
602 #ifdef HAVE_SELINUX
603                 if (arg_selinux_apifs_context &&
604                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
605                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
606                         if (!options)
607                                 return log_oom();
608
609                         o = options;
610                 } else
611 #endif
612                         o = mount_table[k].options;
613
614
615                 if (mount(mount_table[k].what,
616                           where,
617                           mount_table[k].type,
618                           mount_table[k].flags,
619                           o) < 0 &&
620                     mount_table[k].fatal) {
621
622                         log_error("mount(%s) failed: %m", where);
623
624                         if (r == 0)
625                                 r = -errno;
626                 }
627         }
628
629         return r;
630 }
631
632 static int mount_binds(const char *dest, char **l, unsigned long flags) {
633         char **x, **y;
634
635         STRV_FOREACH_PAIR(x, y, l) {
636                 char *where;
637                 struct stat source_st, dest_st;
638                 int r;
639
640                 if (stat(*x, &source_st) < 0) {
641                         log_error("Failed to stat %s: %m", *x);
642                         return -errno;
643                 }
644
645                 where = strappenda(dest, *y);
646                 r = stat(where, &dest_st);
647                 if (r == 0) {
648                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
649                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
650                                                 *x, where);
651                                 return -EINVAL;
652                         }
653                 } else if (errno == ENOENT) {
654                         r = mkdir_parents_label(where, 0755);
655                         if (r < 0) {
656                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
657                                 return r;
658                         }
659                 } else {
660                         log_error("Failed to bind mount %s: %m", *x);
661                         return -errno;
662                 }
663                 /* Create the mount point, but be conservative -- refuse to create block
664                 * and char devices. */
665                 if (S_ISDIR(source_st.st_mode))
666                         mkdir_label(where, 0755);
667                 else if (S_ISFIFO(source_st.st_mode))
668                         mkfifo(where, 0644);
669                 else if (S_ISSOCK(source_st.st_mode))
670                         mknod(where, 0644 | S_IFSOCK, 0);
671                 else if (S_ISREG(source_st.st_mode))
672                         touch(where);
673                 else {
674                         log_error("Refusing to create mountpoint for file: %s", *x);
675                         return -ENOTSUP;
676                 }
677
678                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
679                         log_error("mount(%s) failed: %m", where);
680                         return -errno;
681                 }
682
683                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
684                         log_error("mount(%s) failed: %m", where);
685                         return -errno;
686                 }
687         }
688
689         return 0;
690 }
691
692 static int setup_timezone(const char *dest) {
693         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
694         char *z, *y;
695         int r;
696
697         assert(dest);
698
699         /* Fix the timezone, if possible */
700         r = readlink_malloc("/etc/localtime", &p);
701         if (r < 0) {
702                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
703                 return 0;
704         }
705
706         z = path_startswith(p, "../usr/share/zoneinfo/");
707         if (!z)
708                 z = path_startswith(p, "/usr/share/zoneinfo/");
709         if (!z) {
710                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
711                 return 0;
712         }
713
714         where = strappend(dest, "/etc/localtime");
715         if (!where)
716                 return log_oom();
717
718         r = readlink_malloc(where, &q);
719         if (r >= 0) {
720                 y = path_startswith(q, "../usr/share/zoneinfo/");
721                 if (!y)
722                         y = path_startswith(q, "/usr/share/zoneinfo/");
723
724
725                 /* Already pointing to the right place? Then do nothing .. */
726                 if (y && streq(y, z))
727                         return 0;
728         }
729
730         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
731         if (!check)
732                 return log_oom();
733
734         if (access(check, F_OK) < 0) {
735                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
736                 return 0;
737         }
738
739         what = strappend("../usr/share/zoneinfo/", z);
740         if (!what)
741                 return log_oom();
742
743         unlink(where);
744         if (symlink(what, where) < 0) {
745                 log_error("Failed to correct timezone of container: %m");
746                 return 0;
747         }
748
749         return 0;
750 }
751
752 static int setup_resolv_conf(const char *dest) {
753         char _cleanup_free_ *where = NULL;
754
755         assert(dest);
756
757         if (arg_private_network)
758                 return 0;
759
760         /* Fix resolv.conf, if possible */
761         where = strappend(dest, "/etc/resolv.conf");
762         if (!where)
763                 return log_oom();
764
765         /* We don't really care for the results of this really. If it
766          * fails, it fails, but meh... */
767         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
768
769         return 0;
770 }
771
772 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
773
774         snprintf(s, 37,
775                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
776                  SD_ID128_FORMAT_VAL(id));
777
778         return s;
779 }
780
781 static int setup_boot_id(const char *dest) {
782         _cleanup_free_ char *from = NULL, *to = NULL;
783         sd_id128_t rnd = {};
784         char as_uuid[37];
785         int r;
786
787         assert(dest);
788
789         if (arg_share_system)
790                 return 0;
791
792         /* Generate a new randomized boot ID, so that each boot-up of
793          * the container gets a new one */
794
795         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
796         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
797         if (!from || !to)
798                 return log_oom();
799
800         r = sd_id128_randomize(&rnd);
801         if (r < 0) {
802                 log_error("Failed to generate random boot id: %s", strerror(-r));
803                 return r;
804         }
805
806         id128_format_as_uuid(rnd, as_uuid);
807
808         r = write_string_file(from, as_uuid);
809         if (r < 0) {
810                 log_error("Failed to write boot id: %s", strerror(-r));
811                 return r;
812         }
813
814         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
815                 log_error("Failed to bind mount boot id: %m");
816                 r = -errno;
817         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
818                 log_warning("Failed to make boot id read-only: %m");
819
820         unlink(from);
821         return r;
822 }
823
824 static int copy_devnodes(const char *dest) {
825
826         static const char devnodes[] =
827                 "null\0"
828                 "zero\0"
829                 "full\0"
830                 "random\0"
831                 "urandom\0"
832                 "tty\0";
833
834         const char *d;
835         int r = 0;
836         _cleanup_umask_ mode_t u;
837
838         assert(dest);
839
840         u = umask(0000);
841
842         NULSTR_FOREACH(d, devnodes) {
843                 _cleanup_free_ char *from = NULL, *to = NULL;
844                 struct stat st;
845
846                 from = strappend("/dev/", d);
847                 to = strjoin(dest, "/dev/", d, NULL);
848                 if (!from || !to)
849                         return log_oom();
850
851                 if (stat(from, &st) < 0) {
852
853                         if (errno != ENOENT) {
854                                 log_error("Failed to stat %s: %m", from);
855                                 return -errno;
856                         }
857
858                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
859
860                         log_error("%s is not a char or block device, cannot copy", from);
861                         return -EIO;
862
863                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
864
865                         log_error("mknod(%s) failed: %m", dest);
866                         return  -errno;
867                 }
868         }
869
870         return r;
871 }
872
873 static int setup_ptmx(const char *dest) {
874         _cleanup_free_ char *p = NULL;
875
876         p = strappend(dest, "/dev/ptmx");
877         if (!p)
878                 return log_oom();
879
880         if (symlink("pts/ptmx", p) < 0) {
881                 log_error("Failed to create /dev/ptmx symlink: %m");
882                 return -errno;
883         }
884
885         return 0;
886 }
887
888 static int setup_dev_console(const char *dest, const char *console) {
889         _cleanup_umask_ mode_t u;
890         const char *to;
891         struct stat st;
892         int r;
893
894         assert(dest);
895         assert(console);
896
897         u = umask(0000);
898
899         if (stat("/dev/null", &st) < 0) {
900                 log_error("Failed to stat /dev/null: %m");
901                 return -errno;
902         }
903
904         r = chmod_and_chown(console, 0600, 0, 0);
905         if (r < 0) {
906                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
907                 return r;
908         }
909
910         /* We need to bind mount the right tty to /dev/console since
911          * ptys can only exist on pts file systems. To have something
912          * to bind mount things on we create a device node first, and
913          * use /dev/null for that since we the cgroups device policy
914          * allows us to create that freely, while we cannot create
915          * /dev/console. (Note that the major minor doesn't actually
916          * matter here, since we mount it over anyway). */
917
918         to = strappenda(dest, "/dev/console");
919         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
920                 log_error("mknod() for /dev/console failed: %m");
921                 return -errno;
922         }
923
924         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
925                 log_error("Bind mount for /dev/console failed: %m");
926                 return -errno;
927         }
928
929         return 0;
930 }
931
932 static int setup_kmsg(const char *dest, int kmsg_socket) {
933         _cleanup_free_ char *from = NULL, *to = NULL;
934         int r, fd, k;
935         _cleanup_umask_ mode_t u;
936         union {
937                 struct cmsghdr cmsghdr;
938                 uint8_t buf[CMSG_SPACE(sizeof(int))];
939         } control = {};
940         struct msghdr mh = {
941                 .msg_control = &control,
942                 .msg_controllen = sizeof(control),
943         };
944         struct cmsghdr *cmsg;
945
946         assert(dest);
947         assert(kmsg_socket >= 0);
948
949         u = umask(0000);
950
951         /* We create the kmsg FIFO as /dev/kmsg, but immediately
952          * delete it after bind mounting it to /proc/kmsg. While FIFOs
953          * on the reading side behave very similar to /proc/kmsg,
954          * their writing side behaves differently from /dev/kmsg in
955          * that writing blocks when nothing is reading. In order to
956          * avoid any problems with containers deadlocking due to this
957          * we simply make /dev/kmsg unavailable to the container. */
958         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
959             asprintf(&to, "%s/proc/kmsg", dest) < 0)
960                 return log_oom();
961
962         if (mkfifo(from, 0600) < 0) {
963                 log_error("mkfifo() for /dev/kmsg failed: %m");
964                 return -errno;
965         }
966
967         r = chmod_and_chown(from, 0600, 0, 0);
968         if (r < 0) {
969                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
970                 return r;
971         }
972
973         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
974                 log_error("Bind mount for /proc/kmsg failed: %m");
975                 return -errno;
976         }
977
978         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
979         if (fd < 0) {
980                 log_error("Failed to open fifo: %m");
981                 return -errno;
982         }
983
984         cmsg = CMSG_FIRSTHDR(&mh);
985         cmsg->cmsg_level = SOL_SOCKET;
986         cmsg->cmsg_type = SCM_RIGHTS;
987         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
988         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
989
990         mh.msg_controllen = cmsg->cmsg_len;
991
992         /* Store away the fd in the socket, so that it stays open as
993          * long as we run the child */
994         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
995         safe_close(fd);
996
997         if (k < 0) {
998                 log_error("Failed to send FIFO fd: %m");
999                 return -errno;
1000         }
1001
1002         /* And now make the FIFO unavailable as /dev/kmsg... */
1003         unlink(from);
1004         return 0;
1005 }
1006
1007 static int setup_hostname(void) {
1008
1009         if (arg_share_system)
1010                 return 0;
1011
1012         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1013                 return -errno;
1014
1015         return 0;
1016 }
1017
1018 static int setup_journal(const char *directory) {
1019         sd_id128_t machine_id, this_id;
1020         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1021         char *id;
1022         int r;
1023
1024         p = strappend(directory, "/etc/machine-id");
1025         if (!p)
1026                 return log_oom();
1027
1028         r = read_one_line_file(p, &b);
1029         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1030                 return 0;
1031         else if (r < 0) {
1032                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1033                 return r;
1034         }
1035
1036         id = strstrip(b);
1037         if (isempty(id) && arg_link_journal == LINK_AUTO)
1038                 return 0;
1039
1040         /* Verify validity */
1041         r = sd_id128_from_string(id, &machine_id);
1042         if (r < 0) {
1043                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1044                 return r;
1045         }
1046
1047         r = sd_id128_get_machine(&this_id);
1048         if (r < 0) {
1049                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1050                 return r;
1051         }
1052
1053         if (sd_id128_equal(machine_id, this_id)) {
1054                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1055                          "Host and machine ids are equal (%s): refusing to link journals", id);
1056                 if (arg_link_journal == LINK_AUTO)
1057                         return 0;
1058                 return
1059                         -EEXIST;
1060         }
1061
1062         if (arg_link_journal == LINK_NO)
1063                 return 0;
1064
1065         free(p);
1066         p = strappend("/var/log/journal/", id);
1067         q = strjoin(directory, "/var/log/journal/", id, NULL);
1068         if (!p || !q)
1069                 return log_oom();
1070
1071         if (path_is_mount_point(p, false) > 0) {
1072                 if (arg_link_journal != LINK_AUTO) {
1073                         log_error("%s: already a mount point, refusing to use for journal", p);
1074                         return -EEXIST;
1075                 }
1076
1077                 return 0;
1078         }
1079
1080         if (path_is_mount_point(q, false) > 0) {
1081                 if (arg_link_journal != LINK_AUTO) {
1082                         log_error("%s: already a mount point, refusing to use for journal", q);
1083                         return -EEXIST;
1084                 }
1085
1086                 return 0;
1087         }
1088
1089         r = readlink_and_make_absolute(p, &d);
1090         if (r >= 0) {
1091                 if ((arg_link_journal == LINK_GUEST ||
1092                      arg_link_journal == LINK_AUTO) &&
1093                     path_equal(d, q)) {
1094
1095                         r = mkdir_p(q, 0755);
1096                         if (r < 0)
1097                                 log_warning("failed to create directory %s: %m", q);
1098                         return 0;
1099                 }
1100
1101                 if (unlink(p) < 0) {
1102                         log_error("Failed to remove symlink %s: %m", p);
1103                         return -errno;
1104                 }
1105         } else if (r == -EINVAL) {
1106
1107                 if (arg_link_journal == LINK_GUEST &&
1108                     rmdir(p) < 0) {
1109
1110                         if (errno == ENOTDIR) {
1111                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1112                                 return r;
1113                         } else {
1114                                 log_error("Failed to remove %s: %m", p);
1115                                 return -errno;
1116                         }
1117                 }
1118         } else if (r != -ENOENT) {
1119                 log_error("readlink(%s) failed: %m", p);
1120                 return r;
1121         }
1122
1123         if (arg_link_journal == LINK_GUEST) {
1124
1125                 if (symlink(q, p) < 0) {
1126                         log_error("Failed to symlink %s to %s: %m", q, p);
1127                         return -errno;
1128                 }
1129
1130                 r = mkdir_p(q, 0755);
1131                 if (r < 0)
1132                         log_warning("failed to create directory %s: %m", q);
1133                 return 0;
1134         }
1135
1136         if (arg_link_journal == LINK_HOST) {
1137                 r = mkdir_p(p, 0755);
1138                 if (r < 0) {
1139                         log_error("Failed to create %s: %m", p);
1140                         return r;
1141                 }
1142
1143         } else if (access(p, F_OK) < 0)
1144                 return 0;
1145
1146         if (dir_is_empty(q) == 0)
1147                 log_warning("%s is not empty, proceeding anyway.", q);
1148
1149         r = mkdir_p(q, 0755);
1150         if (r < 0) {
1151                 log_error("Failed to create %s: %m", q);
1152                 return r;
1153         }
1154
1155         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1156                 log_error("Failed to bind mount journal from host into guest: %m");
1157                 return -errno;
1158         }
1159
1160         return 0;
1161 }
1162
1163 static int setup_kdbus(const char *dest, const char *path) {
1164         const char *p;
1165
1166         if (!path)
1167                 return 0;
1168
1169         p = strappenda(dest, "/dev/kdbus");
1170         if (mkdir(p, 0755) < 0) {
1171                 log_error("Failed to create kdbus path: %m");
1172                 return  -errno;
1173         }
1174
1175         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1176                 log_error("Failed to mount kdbus domain path: %m");
1177                 return -errno;
1178         }
1179
1180         return 0;
1181 }
1182
1183 static int drop_capabilities(void) {
1184         return capability_bounding_set_drop(~arg_retain, false);
1185 }
1186
1187 static int register_machine(pid_t pid) {
1188         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1189         _cleanup_bus_unref_ sd_bus *bus = NULL;
1190         int r;
1191
1192         if (!arg_register)
1193                 return 0;
1194
1195         r = sd_bus_default_system(&bus);
1196         if (r < 0) {
1197                 log_error("Failed to open system bus: %s", strerror(-r));
1198                 return r;
1199         }
1200
1201         if (arg_keep_unit) {
1202                 r = sd_bus_call_method(
1203                                 bus,
1204                                 "org.freedesktop.machine1",
1205                                 "/org/freedesktop/machine1",
1206                                 "org.freedesktop.machine1.Manager",
1207                                 "RegisterMachine",
1208                                 &error,
1209                                 NULL,
1210                                 "sayssus",
1211                                 arg_machine,
1212                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1213                                 "nspawn",
1214                                 "container",
1215                                 (uint32_t) pid,
1216                                 strempty(arg_directory));
1217         } else {
1218                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1219
1220                 r = sd_bus_message_new_method_call(
1221                                 bus,
1222                                 &m,
1223                                 "org.freedesktop.machine1",
1224                                 "/org/freedesktop/machine1",
1225                                 "org.freedesktop.machine1.Manager",
1226                                 "CreateMachine");
1227                 if (r < 0) {
1228                         log_error("Failed to create message: %s", strerror(-r));
1229                         return r;
1230                 }
1231
1232                 r = sd_bus_message_append(
1233                                 m,
1234                                 "sayssus",
1235                                 arg_machine,
1236                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1237                                 "nspawn",
1238                                 "container",
1239                                 (uint32_t) pid,
1240                                 strempty(arg_directory));
1241                 if (r < 0) {
1242                         log_error("Failed to append message arguments: %s", strerror(-r));
1243                         return r;
1244                 }
1245
1246                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1247                 if (r < 0) {
1248                         log_error("Failed to open container: %s", strerror(-r));
1249                         return r;
1250                 }
1251
1252                 if (!isempty(arg_slice)) {
1253                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1254                         if (r < 0) {
1255                                 log_error("Failed to append slice: %s", strerror(-r));
1256                                 return r;
1257                         }
1258                 }
1259
1260                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1261                 if (r < 0) {
1262                         log_error("Failed to add device policy: %s", strerror(-r));
1263                         return r;
1264                 }
1265
1266                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1267                                           /* Allow the container to
1268                                            * access and create the API
1269                                            * device nodes, so that
1270                                            * PrivateDevices= in the
1271                                            * container can work
1272                                            * fine */
1273                                           "/dev/null", "rwm",
1274                                           "/dev/zero", "rwm",
1275                                           "/dev/full", "rwm",
1276                                           "/dev/random", "rwm",
1277                                           "/dev/urandom", "rwm",
1278                                           "/dev/tty", "rwm",
1279                                           /* Allow the container
1280                                            * access to ptys. However,
1281                                            * do not permit the
1282                                            * container to ever create
1283                                            * these device nodes. */
1284                                           "/dev/pts/ptmx", "rw",
1285                                           "char-pts", "rw",
1286                                           /* Allow the container
1287                                            * access to all kdbus
1288                                            * devices. Again, the
1289                                            * container cannot create
1290                                            * these nodes, only use
1291                                            * them. We use a pretty
1292                                            * open match here, so that
1293                                            * the kernel API can still
1294                                            * change. */
1295                                           "char-kdbus", "rw",
1296                                           "char-kdbus/*", "rw");
1297                 if (r < 0) {
1298                         log_error("Failed to add device whitelist: %s", strerror(-r));
1299                         return r;
1300                 }
1301
1302                 r = sd_bus_message_close_container(m);
1303                 if (r < 0) {
1304                         log_error("Failed to close container: %s", strerror(-r));
1305                         return r;
1306                 }
1307
1308                 r = sd_bus_call(bus, m, 0, &error, NULL);
1309         }
1310
1311         if (r < 0) {
1312                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1313                 return r;
1314         }
1315
1316         return 0;
1317 }
1318
1319 static int terminate_machine(pid_t pid) {
1320         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1321         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1322         _cleanup_bus_unref_ sd_bus *bus = NULL;
1323         const char *path;
1324         int r;
1325
1326         if (!arg_register)
1327                 return 0;
1328
1329         r = sd_bus_default_system(&bus);
1330         if (r < 0) {
1331                 log_error("Failed to open system bus: %s", strerror(-r));
1332                 return r;
1333         }
1334
1335         r = sd_bus_call_method(
1336                         bus,
1337                         "org.freedesktop.machine1",
1338                         "/org/freedesktop/machine1",
1339                         "org.freedesktop.machine1.Manager",
1340                         "GetMachineByPID",
1341                         &error,
1342                         &reply,
1343                         "u",
1344                         (uint32_t) pid);
1345         if (r < 0) {
1346                 /* Note that the machine might already have been
1347                  * cleaned up automatically, hence don't consider it a
1348                  * failure if we cannot get the machine object. */
1349                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1350                 return 0;
1351         }
1352
1353         r = sd_bus_message_read(reply, "o", &path);
1354         if (r < 0)
1355                 return bus_log_parse_error(r);
1356
1357         r = sd_bus_call_method(
1358                         bus,
1359                         "org.freedesktop.machine1",
1360                         path,
1361                         "org.freedesktop.machine1.Machine",
1362                         "Terminate",
1363                         &error,
1364                         NULL,
1365                         NULL);
1366         if (r < 0) {
1367                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1368                 return 0;
1369         }
1370
1371         return 0;
1372 }
1373
1374 static int reset_audit_loginuid(void) {
1375         _cleanup_free_ char *p = NULL;
1376         int r;
1377
1378         if (arg_share_system)
1379                 return 0;
1380
1381         r = read_one_line_file("/proc/self/loginuid", &p);
1382         if (r == -ENOENT)
1383                 return 0;
1384         if (r < 0) {
1385                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1386                 return r;
1387         }
1388
1389         /* Already reset? */
1390         if (streq(p, "4294967295"))
1391                 return 0;
1392
1393         r = write_string_file("/proc/self/loginuid", "4294967295");
1394         if (r < 0) {
1395                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1396                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1397                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1398                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1399                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1400
1401                 sleep(5);
1402         }
1403
1404         return 0;
1405 }
1406
1407 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1408
1409 static int get_mac(struct ether_addr *mac) {
1410         int r;
1411
1412         uint8_t result[8];
1413         size_t l, sz;
1414         uint8_t *v;
1415
1416         l = strlen(arg_machine);
1417         sz = sizeof(sd_id128_t) + l;
1418         v = alloca(sz);
1419
1420         /* fetch some persistent data unique to the host */
1421         r = sd_id128_get_machine((sd_id128_t*) v);
1422         if (r < 0)
1423                 return r;
1424
1425         /* combine with some data unique (on this host) to this
1426          * container instance */
1427         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1428
1429         /* Let's hash the host machine ID plus the container name. We
1430          * use a fixed, but originally randomly created hash key here. */
1431         siphash24(result, v, sz, HASH_KEY.bytes);
1432
1433         assert_cc(ETH_ALEN <= sizeof(result));
1434         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1435
1436         /* see eth_random_addr in the kernel */
1437         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1438         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1439
1440         return 0;
1441 }
1442
1443 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1444         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1445         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1446         struct ether_addr mac;
1447         int r;
1448
1449         if (!arg_private_network)
1450                 return 0;
1451
1452         if (!arg_network_veth)
1453                 return 0;
1454
1455         /* Use two different interface name prefixes depending whether
1456          * we are in bridge mode or not. */
1457         if (arg_network_bridge)
1458                 memcpy(iface_name, "vb-", 3);
1459         else
1460                 memcpy(iface_name, "ve-", 3);
1461         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1462
1463         r = get_mac(&mac);
1464         if (r < 0) {
1465                 log_error("Failed to generate predictable MAC address for host0");
1466                 return r;
1467         }
1468
1469         r = sd_rtnl_open(&rtnl, 0);
1470         if (r < 0) {
1471                 log_error("Failed to connect to netlink: %s", strerror(-r));
1472                 return r;
1473         }
1474
1475         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1476         if (r < 0) {
1477                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1478                 return r;
1479         }
1480
1481         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1482         if (r < 0) {
1483                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1484                 return r;
1485         }
1486
1487         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1488         if (r < 0) {
1489                 log_error("Failed to open netlink container: %s", strerror(-r));
1490                 return r;
1491         }
1492
1493         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1494         if (r < 0) {
1495                 log_error("Failed to open netlink container: %s", strerror(-r));
1496                 return r;
1497         }
1498
1499         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1500         if (r < 0) {
1501                 log_error("Failed to open netlink container: %s", strerror(-r));
1502                 return r;
1503         }
1504
1505         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1506         if (r < 0) {
1507                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1508                 return r;
1509         }
1510
1511         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1512         if (r < 0) {
1513                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1514                 return r;
1515         }
1516
1517         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1518         if (r < 0) {
1519                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1520                 return r;
1521         }
1522
1523         r = sd_rtnl_message_close_container(m);
1524         if (r < 0) {
1525                 log_error("Failed to close netlink container: %s", strerror(-r));
1526                 return r;
1527         }
1528
1529         r = sd_rtnl_message_close_container(m);
1530         if (r < 0) {
1531                 log_error("Failed to close netlink container: %s", strerror(-r));
1532                 return r;
1533         }
1534
1535         r = sd_rtnl_message_close_container(m);
1536         if (r < 0) {
1537                 log_error("Failed to close netlink container: %s", strerror(-r));
1538                 return r;
1539         }
1540
1541         r = sd_rtnl_call(rtnl, m, 0, NULL);
1542         if (r < 0) {
1543                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1544                 return r;
1545         }
1546
1547         return 0;
1548 }
1549
1550 static int setup_bridge(const char veth_name[]) {
1551         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1552         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1553         int r, bridge;
1554
1555         if (!arg_private_network)
1556                 return 0;
1557
1558         if (!arg_network_veth)
1559                 return 0;
1560
1561         if (!arg_network_bridge)
1562                 return 0;
1563
1564         bridge = (int) if_nametoindex(arg_network_bridge);
1565         if (bridge <= 0) {
1566                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1567                 return -errno;
1568         }
1569
1570         r = sd_rtnl_open(&rtnl, 0);
1571         if (r < 0) {
1572                 log_error("Failed to connect to netlink: %s", strerror(-r));
1573                 return r;
1574         }
1575
1576         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1577         if (r < 0) {
1578                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1579                 return r;
1580         }
1581
1582         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1583         if (r < 0) {
1584                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1585                 return r;
1586         }
1587
1588         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1589         if (r < 0) {
1590                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1591                 return r;
1592         }
1593
1594         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1595         if (r < 0) {
1596                 log_error("Failed to add netlink master field: %s", strerror(-r));
1597                 return r;
1598         }
1599
1600         r = sd_rtnl_call(rtnl, m, 0, NULL);
1601         if (r < 0) {
1602                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1603                 return r;
1604         }
1605
1606         return 0;
1607 }
1608
1609 static int parse_interface(struct udev *udev, const char *name) {
1610         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1611         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1612         int ifi;
1613
1614         ifi = (int) if_nametoindex(name);
1615         if (ifi <= 0) {
1616                 log_error("Failed to resolve interface %s: %m", name);
1617                 return -errno;
1618         }
1619
1620         sprintf(ifi_str, "n%i", ifi);
1621         d = udev_device_new_from_device_id(udev, ifi_str);
1622         if (!d) {
1623                 log_error("Failed to get udev device for interface %s: %m", name);
1624                 return -errno;
1625         }
1626
1627         if (udev_device_get_is_initialized(d) <= 0) {
1628                 log_error("Network interface %s is not initialized yet.", name);
1629                 return -EBUSY;
1630         }
1631
1632         return ifi;
1633 }
1634
1635 static int move_network_interfaces(pid_t pid) {
1636         _cleanup_udev_unref_ struct udev *udev = NULL;
1637         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1638         char **i;
1639         int r;
1640
1641         if (!arg_private_network)
1642                 return 0;
1643
1644         if (strv_isempty(arg_network_interfaces))
1645                 return 0;
1646
1647         r = sd_rtnl_open(&rtnl, 0);
1648         if (r < 0) {
1649                 log_error("Failed to connect to netlink: %s", strerror(-r));
1650                 return r;
1651         }
1652
1653         udev = udev_new();
1654         if (!udev) {
1655                 log_error("Failed to connect to udev.");
1656                 return -ENOMEM;
1657         }
1658
1659         STRV_FOREACH(i, arg_network_interfaces) {
1660                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1661                 int ifi;
1662
1663                 ifi = parse_interface(udev, *i);
1664                 if (ifi < 0)
1665                         return ifi;
1666
1667                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1668                 if (r < 0) {
1669                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1670                         return r;
1671                 }
1672
1673                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1674                 if (r < 0) {
1675                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1676                         return r;
1677                 }
1678
1679                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1680                 if (r < 0) {
1681                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1682                         return r;
1683                 }
1684         }
1685
1686         return 0;
1687 }
1688
1689 static int setup_macvlan(pid_t pid) {
1690         _cleanup_udev_unref_ struct udev *udev = NULL;
1691         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1692         char **i;
1693         int r;
1694
1695         if (!arg_private_network)
1696                 return 0;
1697
1698         if (strv_isempty(arg_network_macvlan))
1699                 return 0;
1700
1701         r = sd_rtnl_open(&rtnl, 0);
1702         if (r < 0) {
1703                 log_error("Failed to connect to netlink: %s", strerror(-r));
1704                 return r;
1705         }
1706
1707         udev = udev_new();
1708         if (!udev) {
1709                 log_error("Failed to connect to udev.");
1710                 return -ENOMEM;
1711         }
1712
1713         STRV_FOREACH(i, arg_network_macvlan) {
1714                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1715                 _cleanup_free_ char *n = NULL;
1716                 int ifi;
1717
1718                 ifi = parse_interface(udev, *i);
1719                 if (ifi < 0)
1720                         return ifi;
1721
1722                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1723                 if (r < 0) {
1724                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1725                         return r;
1726                 }
1727
1728                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1729                 if (r < 0) {
1730                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1731                         return r;
1732                 }
1733
1734                 n = strappend("mv-", *i);
1735                 if (!n)
1736                         return log_oom();
1737
1738                 strshorten(n, IFNAMSIZ-1);
1739
1740                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1741                 if (r < 0) {
1742                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1743                         return r;
1744                 }
1745
1746                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1747                 if (r < 0) {
1748                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1749                         return r;
1750                 }
1751
1752                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1753                 if (r < 0) {
1754                         log_error("Failed to open netlink container: %s", strerror(-r));
1755                         return r;
1756                 }
1757
1758                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1759                 if (r < 0) {
1760                         log_error("Failed to open netlink container: %s", strerror(-r));
1761                         return r;
1762                 }
1763
1764                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1765                 if (r < 0) {
1766                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1767                         return r;
1768                 }
1769
1770                 r = sd_rtnl_message_close_container(m);
1771                 if (r < 0) {
1772                         log_error("Failed to close netlink container: %s", strerror(-r));
1773                         return r;
1774                 }
1775
1776                 r = sd_rtnl_message_close_container(m);
1777                 if (r < 0) {
1778                         log_error("Failed to close netlink container: %s", strerror(-r));
1779                         return r;
1780                 }
1781
1782                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1783                 if (r < 0) {
1784                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1785                         return r;
1786                 }
1787         }
1788
1789         return 0;
1790 }
1791
/* Installs a seccomp filter that allows everything except
 * socket(AF_NETLINK, *, NETLINK_AUDIT), which is made to fail with
 * EAFNOSUPPORT.
 *
 * Rationale: audit does not work inside containers, and audit userspace
 * treats EAFNOSUPPORT on that socket() call as "audit is disabled in the
 * kernel", so it backs off cleanly instead of failing later.
 *
 * Returns 0 on success (or when built without seccomp support) and a
 * negative errno-style code on failure. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx filter;
        int k;

        /* Default action: let every system call through */
        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        k = seccomp_add_secondary_archs(filter);
        if (k < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-k));
                goto release;
        }

        /* Match on both the address family (arg 0) and the protocol (arg 2) */
        k = seccomp_rule_add(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (k < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-k));
                goto release;
        }

        /* Turn off the filter attribute controlling NO_NEW_PRIVS */
        k = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (k < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-k));
                goto release;
        }

        k = seccomp_load(filter);
        if (k < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-k));

release:
        seccomp_release(filter);
        return k;
#else
        return 0;
#endif

}
1848
1849 static int setup_image(char **device_path, int *loop_nr) {
1850         struct loop_info64 info = {
1851                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1852         };
1853         _cleanup_close_ int fd = -1, control = -1, loop = -1;
1854         _cleanup_free_ char* loopdev = NULL;
1855         struct stat st;
1856         int r, nr;
1857
1858         assert(device_path);
1859         assert(loop_nr);
1860
1861         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1862         if (fd < 0) {
1863                 log_error("Failed to open %s: %m", arg_image);
1864                 return -errno;
1865         }
1866
1867         if (fstat(fd, &st) < 0) {
1868                 log_error("Failed to stat %s: %m", arg_image);
1869                 return -errno;
1870         }
1871
1872         if (S_ISBLK(st.st_mode)) {
1873                 char *p;
1874
1875                 p = strdup(arg_image);
1876                 if (!p)
1877                         return log_oom();
1878
1879                 *device_path = p;
1880
1881                 *loop_nr = -1;
1882
1883                 r = fd;
1884                 fd = -1;
1885
1886                 return r;
1887         }
1888
1889         if (!S_ISREG(st.st_mode)) {
1890                 log_error("%s is not a regular file or block device: %m", arg_image);
1891                 return -EINVAL;
1892         }
1893
1894         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1895         if (control < 0) {
1896                 log_error("Failed to open /dev/loop-control: %m");
1897                 return -errno;
1898         }
1899
1900         nr = ioctl(control, LOOP_CTL_GET_FREE);
1901         if (nr < 0) {
1902                 log_error("Failed to allocate loop device: %m");
1903                 return -errno;
1904         }
1905
1906         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1907                 return log_oom();
1908
1909         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1910         if (loop < 0) {
1911                 log_error("Failed to open loop device %s: %m", loopdev);
1912                 return -errno;
1913         }
1914
1915         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1916                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1917                 return -errno;
1918         }
1919
1920         if (arg_read_only)
1921                 info.lo_flags |= LO_FLAGS_READ_ONLY;
1922
1923         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1924                 log_error("Failed to set loopback settings on %s: %m", loopdev);
1925                 return -errno;
1926         }
1927
1928         *device_path = loopdev;
1929         loopdev = NULL;
1930
1931         *loop_nr = nr;
1932
1933         r = loop;
1934         loop = -1;
1935
1936         return r;
1937 }
1938
#ifdef HAVE_BLKID
/* Helper of dissect_image(): considers the partition with number nr and
 * device node "node" as candidate for one partition role. The candidate
 * replaces the current choice in *device only if no choice was made yet or
 * if its partition number is lower than *device_nr. Returns 0, or the result
 * of log_oom() if the node path cannot be duplicated. */
static int dissect_image_pick(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, bool rw) {

        if (*device && nr >= *device_nr)
                return 0;

        *device_nr = nr;
        *device_rw = rw;

        free(*device);
        *device = strdup(node);
        if (!*device)
                return log_oom();

        return 0;
}
#endif

/* Looks for the root, /home and /srv partitions on the GPT partitioned block
 * device referred to by fd.
 *
 * Partitions are classified by their GPT type UUID (GPT_ROOT_NATIVE,
 * GPT_ROOT_SECONDARY, GPT_HOME, GPT_SRV). Partitions carrying the
 * GPT_FLAG_NO_AUTO flag are ignored. If several partitions share a type, the
 * one with the lowest partition number is used.
 *
 * On success 0 is returned and:
 *  - *root_device / *root_device_rw describe the native root partition, or
 *    the secondary root partition if there is no native one; *secondary
 *    tells which of the two was picked;
 *  - *home_device / *home_device_rw and *srv_device / *srv_device_rw are set
 *    only if such a partition was found.
 * The returned strings are newly allocated device node paths; the *_rw
 * values are false if the partition has GPT_FLAG_READ_ONLY set.
 *
 * On failure a negative errno-style code is returned: -EINVAL if no GPT or
 * no root partition could be identified, -ENOTSUP when built without blkid. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerate = NULL;
        _cleanup_udev_device_unref_ struct udev_device *disk = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *head, *entry;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist partitions;
        struct stat st;
        int k;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        /* errno is reset before each libblkid call below, so that a failure
         * that leaves errno untouched can be told apart afterwards */
        errno = 0;
        k = blkid_probe_set_device(probe, fd, 0, 0);
        if (k != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        k = blkid_do_safeprobe(probe);
        if (k == -2 || k == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }
        if (k != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        /* pttype stays NULL if the lookup fails, which streq_ptr() copes with */
        blkid_probe_lookup_value(probe, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partitions = blkid_probe_get_partitions(probe);
        if (!partitions) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        disk = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!disk)
                return log_oom();

        /* Enumerate the udev devices that have the whole disk as parent */
        enumerate = udev_enumerate_new(udev);
        if (!enumerate)
                return log_oom();

        k = udev_enumerate_add_match_parent(enumerate, disk);
        if (k < 0)
                return log_oom();

        k = udev_enumerate_scan_devices(enumerate);
        if (k < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-k));
                return k;
        }

        head = udev_enumerate_get_list_entry(enumerate);
        udev_list_entry_foreach(entry, head) {
                _cleanup_udev_device_unref_ struct udev_device *part_dev = NULL;
                const char *type_string, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition part;
                dev_t devnum;
                bool rw;
                int nr;

                errno = 0;
                part_dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(entry));
                if (!part_dev) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                devnum = udev_device_get_devnum(part_dev);
                if (major(devnum) == 0)
                        continue;

                /* Skip the whole disk device itself */
                if (st.st_rdev == devnum)
                        continue;

                node = udev_device_get_devnode(part_dev);
                if (!node)
                        continue;

                /* Map the udev device back to blkid's view of the partition */
                part = blkid_partlist_devno_to_partition(partitions, devnum);
                if (!part)
                        continue;

                flags = blkid_partition_get_flags(part);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(part);
                if (nr < 0)
                        continue;

                type_string = blkid_partition_get_type_string(part);
                if (!type_string)
                        continue;

                if (sd_id128_from_string(type_string, &type_id) < 0)
                        continue;

                rw = !(flags & GPT_FLAG_READ_ONLY);

                if (sd_id128_equal(type_id, GPT_HOME))
                        k = dissect_image_pick(&home, &home_nr, &home_rw, node, nr, rw);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        k = dissect_image_pick(&srv, &srv_nr, &srv_rw, node, nr, rw);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        k = dissect_image_pick(&root, &root_nr, &root_rw, node, nr, rw);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        k = dissect_image_pick(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, rw);
#endif
                else
                        k = 0;

                if (k < 0)
                        return k;
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Hand ownership of the strings over to the caller; resetting the
         * locals to NULL keeps the cleanup handlers from freeing them */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else {
                /* secondary_root is non-NULL here, see the check above */
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2188
/* Mounts the block device "what" on the directory "where", or on
 * "where" + "directory" if directory is non-NULL. The file system type is
 * detected with libblkid.
 *
 * The mount is done with MS_NODEV, and read-only if rw is false or if
 * arg_read_only is set. LUKS volumes are refused with -ENOTSUP.
 *
 * Returns 0 on success and a negative errno-style code on failure
 * (-EINVAL if the file system type cannot be determined, -ENOTSUP when
 * built without blkid). */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally read-only container overrides the per-partition setting */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* errno is reset before each libblkid call, so that a failure that
         * leaves errno untouched can be told apart afterwards */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. (-1 is a
                 * genuine probing error and is handled below.) */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2254
/* Mounts the partitions found in a disk image below "where": the root
 * device on "where" itself, the home device on "where"/home and the srv
 * device on "where"/srv. A NULL device is skipped. The *_rw flags say
 * whether the respective partition may be mounted writable.
 *
 * Returns 0 on success, or the negative errno-style code of the first
 * mount that failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount below the directory we were asked to use, rather than
         * reaching for the global arg_directory */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2290
2291 static void loop_remove(int nr, int *image_fd) {
2292         _cleanup_close_ int control = -1;
2293
2294         if (nr < 0)
2295                 return;
2296
2297         if (image_fd && *image_fd >= 0) {
2298                 ioctl(*image_fd, LOOP_CLR_FD);
2299                 *image_fd = safe_close(*image_fd);
2300         }
2301
2302         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2303         if (control < 0)
2304                 return;
2305
2306         ioctl(control, LOOP_CTL_REMOVE, nr);
2307 }
2308
/* Forks off "getent <database> <key>" with an empty environment, stdin and
 * stderr connected to /dev/null and stdout connected to a pipe.
 *
 * On success returns the read end of that pipe, which the caller owns and
 * must close, and stores the PID of the child in *rpid; the caller is
 * expected to wait for it. On failure returns a negative errno-style code. */
static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
        int pipe_fds[2];
        pid_t pid;
        int r;

        assert(database);
        assert(key);
        assert(rpid);

        if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
                log_error("Failed to allocate pipe: %m");
                return -errno;
        }

        pid = fork();
        if (pid < 0) {
                r = -errno;
                log_error("Failed to fork getent child: %m");

                /* Don't leak the pipe if there is no child to use it */
                safe_close(pipe_fds[0]);
                safe_close(pipe_fds[1]);
                return r;
        } else if (pid == 0) {
                int nullfd;
                char *empty_env = NULL;

                /* Child. dup3() with flags 0 creates stdout without
                 * O_CLOEXEC, so that it survives the exec below. */
                if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (pipe_fds[0] > 2)
                        safe_close(pipe_fds[0]);
                if (pipe_fds[1] > 2)
                        safe_close(pipe_fds[1]);

                nullfd = open("/dev/null", O_RDWR);
                if (nullfd < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDIN_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDERR_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (nullfd > 2)
                        safe_close(nullfd);

                reset_all_signal_handlers();
                close_all_fds(NULL, 0);

                /* The argument list of a variadic exec function has to be
                 * terminated by a null pointer of type char*, a bare NULL is
                 * not guaranteed to be one. Try both usual locations of the
                 * binary. */
                execle("/usr/bin/getent", "getent", database, key, (char*) NULL, &empty_env);
                execle("/bin/getent", "getent", database, key, (char*) NULL, &empty_env);
                _exit(EXIT_FAILURE);
        }

        /* Parent: only the read end is needed */
        pipe_fds[1] = safe_close(pipe_fds[1]);

        *rpid = pid;

        return pipe_fds[0];
}
2365
2366 static int change_uid_gid(char **_home) {
2367         char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2368         _cleanup_free_ uid_t *uids = NULL;
2369         _cleanup_free_ char *home = NULL;
2370         _cleanup_fclose_ FILE *f = NULL;
2371         _cleanup_close_ int fd = -1;
2372         unsigned n_uids = 0;
2373         size_t sz = 0, l;
2374         uid_t uid;
2375         gid_t gid;
2376         pid_t pid;
2377         int r;
2378
2379         assert(_home);
2380
2381         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2382                 /* Reset everything fully to 0, just in case */
2383
2384                 if (setgroups(0, NULL) < 0) {
2385                         log_error("setgroups() failed: %m");
2386                         return -errno;
2387                 }
2388
2389                 if (setresgid(0, 0, 0) < 0) {
2390                         log_error("setregid() failed: %m");
2391                         return -errno;
2392                 }
2393
2394                 if (setresuid(0, 0, 0) < 0) {
2395                         log_error("setreuid() failed: %m");
2396                         return -errno;
2397                 }
2398
2399                 *_home = NULL;
2400                 return 0;
2401         }
2402
2403         /* First, get user credentials */
2404         fd = spawn_getent("passwd", arg_user, &pid);
2405         if (fd < 0)
2406                 return fd;
2407
2408         f = fdopen(fd, "r");
2409         if (!f)
2410                 return log_oom();
2411         fd = -1;
2412
2413         if (!fgets(line, sizeof(line), f)) {
2414
2415                 if (!ferror(f)) {
2416                         log_error("Failed to resolve user %s.", arg_user);
2417                         return -ESRCH;
2418                 }
2419
2420                 log_error("Failed to read from getent: %m");
2421                 return -errno;
2422         }
2423
2424         truncate_nl(line);
2425
2426         wait_for_terminate_and_warn("getent passwd", pid);
2427
2428         x = strchr(line, ':');
2429         if (!x) {
2430                 log_error("/etc/passwd entry has invalid user field.");
2431                 return -EIO;
2432         }
2433
2434         u = strchr(x+1, ':');
2435         if (!u) {
2436                 log_error("/etc/passwd entry has invalid password field.");
2437                 return -EIO;
2438         }
2439
2440         u++;
2441         g = strchr(u, ':');
2442         if (!g) {
2443                 log_error("/etc/passwd entry has invalid UID field.");
2444                 return -EIO;
2445         }
2446
2447         *g = 0;
2448         g++;
2449         x = strchr(g, ':');
2450         if (!x) {
2451                 log_error("/etc/passwd entry has invalid GID field.");
2452                 return -EIO;
2453         }
2454
2455         *x = 0;
2456         h = strchr(x+1, ':');
2457         if (!h) {
2458                 log_error("/etc/passwd entry has invalid GECOS field.");
2459                 return -EIO;
2460         }
2461
2462         h++;
2463         x = strchr(h, ':');
2464         if (!x) {
2465                 log_error("/etc/passwd entry has invalid home directory field.");
2466                 return -EIO;
2467         }
2468
2469         *x = 0;
2470
2471         r = parse_uid(u, &uid);
2472         if (r < 0) {
2473                 log_error("Failed to parse UID of user.");
2474                 return -EIO;
2475         }
2476
2477         r = parse_gid(g, &gid);
2478         if (r < 0) {
2479                 log_error("Failed to parse GID of user.");
2480                 return -EIO;
2481         }
2482
2483         home = strdup(h);
2484         if (!home)
2485                 return log_oom();
2486
2487         /* Second, get group memberships */
2488         fd = spawn_getent("initgroups", arg_user, &pid);
2489         if (fd < 0)
2490                 return fd;
2491
2492         fclose(f);
2493         f = fdopen(fd, "r");
2494         if (!f)
2495                 return log_oom();
2496         fd = -1;
2497
2498         if (!fgets(line, sizeof(line), f)) {
2499                 if (!ferror(f)) {
2500                         log_error("Failed to resolve user %s.", arg_user);
2501                         return -ESRCH;
2502                 }
2503
2504                 log_error("Failed to read from getent: %m");
2505                 return -errno;
2506         }
2507
2508         truncate_nl(line);
2509
2510         wait_for_terminate_and_warn("getent initgroups", pid);
2511
2512         /* Skip over the username and subsequent separator whitespace */
2513         x = line;
2514         x += strcspn(x, WHITESPACE);
2515         x += strspn(x, WHITESPACE);
2516
2517         FOREACH_WORD(w, l, x, state) {
2518                 char c[l+1];
2519
2520                 memcpy(c, w, l);
2521                 c[l] = 0;
2522
2523                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2524                         return log_oom();
2525
2526                 r = parse_uid(c, &uids[n_uids++]);
2527                 if (r < 0) {
2528                         log_error("Failed to parse group data from getent.");
2529                         return -EIO;
2530                 }
2531         }
2532
2533         r = mkdir_parents(home, 0775);
2534         if (r < 0) {
2535                 log_error("Failed to make home root directory: %s", strerror(-r));
2536                 return r;
2537         }
2538
2539         r = mkdir_safe(home, 0755, uid, gid);
2540         if (r < 0 && r != -EEXIST) {
2541                 log_error("Failed to make home directory: %s", strerror(-r));
2542                 return r;
2543         }
2544
2545         fchown(STDIN_FILENO, uid, gid);
2546         fchown(STDOUT_FILENO, uid, gid);
2547         fchown(STDERR_FILENO, uid, gid);
2548
2549         if (setgroups(n_uids, uids) < 0) {
2550                 log_error("Failed to set auxiliary groups: %m");
2551                 return -errno;
2552         }
2553
2554         if (setresgid(gid, gid, gid) < 0) {
2555                 log_error("setregid() failed: %m");
2556                 return -errno;
2557         }
2558
2559         if (setresuid(uid, uid, uid) < 0) {
2560                 log_error("setreuid() failed: %m");
2561                 return -errno;
2562         }
2563
2564         if (_home) {
2565                 *_home = home;
2566                 home = NULL;
2567         }
2568
2569         return 0;
2570 }
2571
2572 int main(int argc, char *argv[]) {
2573
2574         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2575         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2576         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2577         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2578         _cleanup_fdset_free_ FDSet *fds = NULL;
2579         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2580         const char *console = NULL;
2581         char veth_name[IFNAMSIZ];
2582         bool secondary = false;
2583         pid_t pid = 0;
2584         sigset_t mask;
2585
2586         log_parse_environment();
2587         log_open();
2588
2589         k = parse_argv(argc, argv);
2590         if (k < 0)
2591                 goto finish;
2592         else if (k == 0) {
2593                 r = EXIT_SUCCESS;
2594                 goto finish;
2595         }
2596
2597         if (!arg_image) {
2598                 if (arg_directory) {
2599                         char *p;
2600
2601                         p = path_make_absolute_cwd(arg_directory);
2602                         free(arg_directory);
2603                         arg_directory = p;
2604                 } else
2605                         arg_directory = get_current_dir_name();
2606
2607                 if (!arg_directory) {
2608                         log_error("Failed to determine path, please use -D.");
2609                         goto finish;
2610                 }
2611                 path_kill_slashes(arg_directory);
2612         }
2613
2614         if (!arg_machine) {
2615                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2616                 if (!arg_machine) {
2617                         log_oom();
2618                         goto finish;
2619                 }
2620
2621                 hostname_cleanup(arg_machine, false);
2622                 if (isempty(arg_machine)) {
2623                         log_error("Failed to determine machine name automatically, please use -M.");
2624                         goto finish;
2625                 }
2626         }
2627
2628         if (geteuid() != 0) {
2629                 log_error("Need to be root.");
2630                 goto finish;
2631         }
2632
2633         if (sd_booted() <= 0) {
2634                 log_error("Not running on a systemd system.");
2635                 goto finish;
2636         }
2637
2638         log_close();
2639         n_fd_passed = sd_listen_fds(false);
2640         if (n_fd_passed > 0) {
2641                 k = fdset_new_listen_fds(&fds, false);
2642                 if (k < 0) {
2643                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2644                         goto finish;
2645                 }
2646         }
2647         fdset_close_others(fds);
2648         log_open();
2649
2650         if (arg_directory) {
2651                 if (path_equal(arg_directory, "/")) {
2652                         log_error("Spawning container on root directory not supported.");
2653                         goto finish;
2654                 }
2655
2656                 if (arg_boot) {
2657                         if (path_is_os_tree(arg_directory) <= 0) {
2658                                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2659                                 goto finish;
2660                         }
2661                 } else {
2662                         const char *p;
2663
2664                         p = strappenda(arg_directory,
2665                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2666                         if (access(p, F_OK) < 0) {
2667                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2668                                 goto finish;
2669
2670                         }
2671                 }
2672         } else {
2673                 char template[] = "/tmp/nspawn-root-XXXXXX";
2674
2675                 if (!mkdtemp(template)) {
2676                         log_error("Failed to create temporary directory: %m");
2677                         r = -errno;
2678                         goto finish;
2679                 }
2680
2681                 arg_directory = strdup(template);
2682                 if (!arg_directory) {
2683                         r = log_oom();
2684                         goto finish;
2685                 }
2686
2687                 image_fd = setup_image(&device_path, &loop_nr);
2688                 if (image_fd < 0) {
2689                         r = image_fd;
2690                         goto finish;
2691                 }
2692
2693                 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2694                 if (r < 0)
2695                         goto finish;
2696         }
2697
2698         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2699         if (master < 0) {
2700                 log_error("Failed to acquire pseudo tty: %m");
2701                 goto finish;
2702         }
2703
2704         console = ptsname(master);
2705         if (!console) {
2706                 log_error("Failed to determine tty name: %m");
2707                 goto finish;
2708         }
2709
2710         if (!arg_quiet)
2711                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
2712
2713         if (unlockpt(master) < 0) {
2714                 log_error("Failed to unlock tty: %m");
2715                 goto finish;
2716         }
2717
2718         if (access("/dev/kdbus/control", F_OK) >= 0) {
2719
2720                 if (arg_share_system) {
2721                         kdbus_domain = strdup("/dev/kdbus");
2722                         if (!kdbus_domain) {
2723                                 log_oom();
2724                                 goto finish;
2725                         }
2726                 } else {
2727                         const char *ns;
2728
2729                         ns = strappenda("machine-", arg_machine);
2730                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2731                         if (r < 0)
2732                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2733                         else
2734                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2735                 }
2736         }
2737
2738         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2739                 log_error("Failed to create kmsg socket pair: %m");
2740                 goto finish;
2741         }
2742
2743         sd_notify(0, "READY=1");
2744
2745         assert_se(sigemptyset(&mask) == 0);
2746         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2747         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2748
2749         for (;;) {
2750                 int parent_ready_fd = -1, child_ready_fd = -1;
2751                 siginfo_t status;
2752                 eventfd_t x;
2753
2754                 parent_ready_fd = eventfd(0, EFD_CLOEXEC);
2755                 if (parent_ready_fd < 0) {
2756                         log_error("Failed to create event fd: %m");
2757                         goto finish;
2758                 }
2759
2760                 child_ready_fd = eventfd(0, EFD_CLOEXEC);
2761                 if (child_ready_fd < 0) {
2762                         log_error("Failed to create event fd: %m");
2763                         goto finish;
2764                 }
2765
2766                 pid = syscall(__NR_clone,
2767                               SIGCHLD|CLONE_NEWNS|
2768                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2769                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
2770                 if (pid < 0) {
2771                         if (errno == EINVAL)
2772                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2773                         else
2774                                 log_error("clone() failed: %m");
2775
2776                         goto finish;
2777                 }
2778
2779                 if (pid == 0) {
2780                         /* child */
2781                         _cleanup_free_ char *home = NULL;
2782                         unsigned n_env = 2;
2783                         const char *envp[] = {
2784                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
2785                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2786                                 NULL, /* TERM */
2787                                 NULL, /* HOME */
2788                                 NULL, /* USER */
2789                                 NULL, /* LOGNAME */
2790                                 NULL, /* container_uuid */
2791                                 NULL, /* LISTEN_FDS */
2792                                 NULL, /* LISTEN_PID */
2793                                 NULL
2794                         };
2795                         char **env_use;
2796
2797                         envp[n_env] = strv_find_prefix(environ, "TERM=");
2798                         if (envp[n_env])
2799                                 n_env ++;
2800
2801                         master = safe_close(master);
2802
2803                         close_nointr(STDIN_FILENO);
2804                         close_nointr(STDOUT_FILENO);
2805                         close_nointr(STDERR_FILENO);
2806
2807                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2808
2809                         reset_all_signal_handlers();
2810
2811                         assert_se(sigemptyset(&mask) == 0);
2812                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2813
2814                         k = open_terminal(console, O_RDWR);
2815                         if (k != STDIN_FILENO) {
2816                                 if (k >= 0) {
2817                                         safe_close(k);
2818                                         k = -EINVAL;
2819                                 }
2820
2821                                 log_error("Failed to open console: %s", strerror(-k));
2822                                 goto child_fail;
2823                         }
2824
2825                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2826                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2827                                 log_error("Failed to duplicate console: %m");
2828                                 goto child_fail;
2829                         }
2830
2831                         if (setsid() < 0) {
2832                                 log_error("setsid() failed: %m");
2833                                 goto child_fail;
2834                         }
2835
2836                         if (reset_audit_loginuid() < 0)
2837                                 goto child_fail;
2838
2839                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2840                                 log_error("PR_SET_PDEATHSIG failed: %m");
2841                                 goto child_fail;
2842                         }
2843
2844                         /* Mark everything as slave, so that we still
2845                          * receive mounts from the real root, but don't
2846                          * propagate mounts to the real root. */
2847                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2848                                 log_error("MS_SLAVE|MS_REC failed: %m");
2849                                 goto child_fail;
2850                         }
2851
2852                         if (mount_devices(arg_directory,
2853                                           root_device, root_device_rw,
2854                                           home_device, home_device_rw,
2855                                           srv_device, srv_device_rw) < 0)
2856                                 goto child_fail;
2857
2858                         /* Turn directory into bind mount */
2859                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2860                                 log_error("Failed to make bind mount.");
2861                                 goto child_fail;
2862                         }
2863
2864                         if (arg_read_only)
2865                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2866                                         log_error("Failed to make read-only.");
2867                                         goto child_fail;
2868                                 }
2869
2870                         if (mount_all(arg_directory) < 0)
2871                                 goto child_fail;
2872
2873                         if (copy_devnodes(arg_directory) < 0)
2874                                 goto child_fail;
2875
2876                         if (setup_ptmx(arg_directory) < 0)
2877                                 goto child_fail;
2878
2879                         dev_setup(arg_directory);
2880
2881                         if (audit_still_doesnt_work_in_containers() < 0)
2882                                 goto child_fail;
2883
2884                         if (setup_dev_console(arg_directory, console) < 0)
2885                                 goto child_fail;
2886
2887                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2888                                 goto child_fail;
2889
2890                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
2891
2892                         if (setup_boot_id(arg_directory) < 0)
2893                                 goto child_fail;
2894
2895                         if (setup_timezone(arg_directory) < 0)
2896                                 goto child_fail;
2897
2898                         if (setup_resolv_conf(arg_directory) < 0)
2899                                 goto child_fail;
2900
2901                         if (setup_journal(arg_directory) < 0)
2902                                 goto child_fail;
2903
2904                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
2905                                 goto child_fail;
2906
2907                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2908                                 goto child_fail;
2909
2910                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
2911                                 goto child_fail;
2912
2913                         /* Tell the parent that we are ready, and that
2914                          * it can cgroupify us to that we lack access
2915                          * to certain devices and resources. */
2916                         eventfd_write(child_ready_fd, 1);
2917                         child_ready_fd = safe_close(child_ready_fd);
2918
2919                         if (chdir(arg_directory) < 0) {
2920                                 log_error("chdir(%s) failed: %m", arg_directory);
2921                                 goto child_fail;
2922                         }
2923
2924                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2925                                 log_error("mount(MS_MOVE) failed: %m");
2926                                 goto child_fail;
2927                         }
2928
2929                         if (chroot(".") < 0) {
2930                                 log_error("chroot() failed: %m");
2931                                 goto child_fail;
2932                         }
2933
2934                         if (chdir("/") < 0) {
2935                                 log_error("chdir() failed: %m");
2936                                 goto child_fail;
2937                         }
2938
2939                         umask(0022);
2940
2941                         if (arg_private_network)
2942                                 loopback_setup();
2943
2944                         if (drop_capabilities() < 0) {
2945                                 log_error("drop_capabilities() failed: %m");
2946                                 goto child_fail;
2947                         }
2948
2949                         r = change_uid_gid(&home);
2950                         if (r < 0)
2951                                 goto child_fail;
2952
2953                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2954                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2955                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
2956                                 log_oom();
2957                                 goto child_fail;
2958                         }
2959
2960                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2961                                 char as_uuid[37];
2962
2963                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
2964                                         log_oom();
2965                                         goto child_fail;
2966                                 }
2967                         }
2968
2969                         if (fdset_size(fds) > 0) {
2970                                 k = fdset_cloexec(fds, false);
2971                                 if (k < 0) {
2972                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
2973                                         goto child_fail;
2974                                 }
2975
2976                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
2977                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
2978                                         log_oom();
2979                                         goto child_fail;
2980                                 }
2981                         }
2982
2983                         setup_hostname();
2984
2985                         if (arg_personality != 0xffffffffLU) {
2986                                 if (personality(arg_personality) < 0) {
2987                                         log_error("personality() failed: %m");
2988                                         goto child_fail;
2989                                 }
2990                         } else if (secondary) {
2991                                 if (personality(PER_LINUX32) < 0) {
2992                                         log_error("personality() failed: %m");
2993                                         goto child_fail;
2994                                 }
2995                         }
2996
2997 #ifdef HAVE_SELINUX
2998                         if (arg_selinux_context)
2999                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3000                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3001                                         goto child_fail;
3002                                 }
3003 #endif
3004
3005                         if (!strv_isempty(arg_setenv)) {
3006                                 char **n;
3007
3008                                 n = strv_env_merge(2, envp, arg_setenv);
3009                                 if (!n) {
3010                                         log_oom();
3011                                         goto child_fail;
3012                                 }
3013
3014                                 env_use = n;
3015                         } else
3016                                 env_use = (char**) envp;
3017
3018                         /* Wait until the parent is ready with the setup, too... */
3019                         eventfd_read(parent_ready_fd, &x);
3020                         parent_ready_fd = safe_close(parent_ready_fd);
3021
3022                         if (arg_boot) {
3023                                 char **a;
3024                                 size_t l;
3025
3026                                 /* Automatically search for the init system */
3027
3028                                 l = 1 + argc - optind;
3029                                 a = newa(char*, l + 1);
3030                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3031
3032                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3033                                 execve(a[0], a, env_use);
3034
3035                                 a[0] = (char*) "/lib/systemd/systemd";
3036                                 execve(a[0], a, env_use);
3037
3038                                 a[0] = (char*) "/sbin/init";
3039                                 execve(a[0], a, env_use);
3040                         } else if (argc > optind)
3041                                 execvpe(argv[optind], argv + optind, env_use);
3042                         else {
3043                                 chdir(home ? home : "/root");
3044                                 execle("/bin/bash", "-bash", NULL, env_use);
3045                                 execle("/bin/sh", "-sh", NULL, env_use);
3046                         }
3047
3048                         log_error("execv() failed: %m");
3049
3050                 child_fail:
3051                         _exit(EXIT_FAILURE);
3052                 }
3053
3054                 fdset_free(fds);
3055                 fds = NULL;
3056
3057                 /* Wait until the child reported that it is ready with
3058                  * all it needs to do with privileges. After we got
3059                  * the notification we can make the process join its
3060                  * cgroup which might limit what it can do */
3061                 eventfd_read(child_ready_fd, &x);
3062
3063                 r = register_machine(pid);
3064                 if (r < 0)
3065                         goto finish;
3066
3067                 r = move_network_interfaces(pid);
3068                 if (r < 0)
3069                         goto finish;
3070
3071                 r = setup_veth(pid, veth_name);
3072                 if (r < 0)
3073                         goto finish;
3074
3075                 r = setup_bridge(veth_name);
3076                 if (r < 0)
3077                         goto finish;
3078
3079                 r = setup_macvlan(pid);
3080                 if (r < 0)
3081                         goto finish;
3082
3083                 /* Notify the child that the parent is ready with all
3084                  * its setup, and thtat the child can now hand over
3085                  * control to the code to run inside the container. */
3086                 eventfd_write(parent_ready_fd, 1);
3087
3088                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3089                 if (k < 0) {
3090                         r = EXIT_FAILURE;
3091                         break;
3092                 }
3093
3094                 if (!arg_quiet)
3095                         putc('\n', stdout);
3096
3097                 /* Kill if it is not dead yet anyway */
3098                 terminate_machine(pid);
3099
3100                 /* Redundant, but better safe than sorry */
3101                 kill(pid, SIGKILL);
3102
3103                 k = wait_for_terminate(pid, &status);
3104                 pid = 0;
3105
3106                 if (k < 0) {
3107                         r = EXIT_FAILURE;
3108                         break;
3109                 }
3110
3111                 if (status.si_code == CLD_EXITED) {
3112                         r = status.si_status;
3113                         if (status.si_status != 0) {
3114                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
3115                                 break;
3116                         }
3117
3118                         if (!arg_quiet)
3119                                 log_debug("Container %s exited successfully.", arg_machine);
3120                         break;
3121                 } else if (status.si_code == CLD_KILLED &&
3122                            status.si_status == SIGINT) {
3123
3124                         if (!arg_quiet)
3125                                 log_info("Container %s has been shut down.", arg_machine);
3126                         r = 0;
3127                         break;
3128                 } else if (status.si_code == CLD_KILLED &&
3129                            status.si_status == SIGHUP) {
3130
3131                         if (!arg_quiet)
3132                                 log_info("Container %s is being rebooted.", arg_machine);
3133                         continue;
3134                 } else if (status.si_code == CLD_KILLED ||
3135                            status.si_code == CLD_DUMPED) {
3136
3137                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3138                         r = EXIT_FAILURE;
3139                         break;
3140                 } else {
3141                         log_error("Container %s failed due to unknown reason.", arg_machine);
3142                         r = EXIT_FAILURE;
3143                         break;
3144                 }
3145         }
3146
3147 finish:
3148         loop_remove(loop_nr, &image_fd);
3149
3150         if (pid > 0)
3151                 kill(pid, SIGKILL);
3152
3153         free(arg_directory);
3154         free(arg_machine);
3155         free(arg_user);
3156         strv_free(arg_setenv);
3157         strv_free(arg_network_interfaces);
3158         strv_free(arg_network_macvlan);
3159         strv_free(arg_bind);
3160         strv_free(arg_bind_ro);
3161
3162         return r;
3163 }