chiark / gitweb /
0cd476cd9e2bbefa3bac4e2191523fc0d562f91b
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90
91 #ifdef HAVE_SECCOMP
92 #include "seccomp-util.h"
93 #endif
94
/* Final state of a container run. Judging by the names this separates a plain
 * exit from a reboot request -- TODO confirm against the code using it. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
99
/* Journal linking mode, selected with --link-journal=MODE (or -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto, the built-in default */
        LINK_HOST  = 2,  /* --link-journal=host */
        LINK_GUEST = 3   /* --link-journal=guest, also what -j selects */
} LinkJournal;
106
107 static char *arg_directory = NULL;
108 static char *arg_user = NULL;
109 static sd_id128_t arg_uuid = {};
110 static char *arg_machine = NULL;
111 static const char *arg_selinux_context = NULL;
112 static const char *arg_selinux_apifs_context = NULL;
113 static const char *arg_slice = NULL;
114 static bool arg_private_network = false;
115 static bool arg_read_only = false;
116 static bool arg_boot = false;
117 static LinkJournal arg_link_journal = LINK_AUTO;
118 static uint64_t arg_retain =
119         (1ULL << CAP_CHOWN) |
120         (1ULL << CAP_DAC_OVERRIDE) |
121         (1ULL << CAP_DAC_READ_SEARCH) |
122         (1ULL << CAP_FOWNER) |
123         (1ULL << CAP_FSETID) |
124         (1ULL << CAP_IPC_OWNER) |
125         (1ULL << CAP_KILL) |
126         (1ULL << CAP_LEASE) |
127         (1ULL << CAP_LINUX_IMMUTABLE) |
128         (1ULL << CAP_NET_BIND_SERVICE) |
129         (1ULL << CAP_NET_BROADCAST) |
130         (1ULL << CAP_NET_RAW) |
131         (1ULL << CAP_SETGID) |
132         (1ULL << CAP_SETFCAP) |
133         (1ULL << CAP_SETPCAP) |
134         (1ULL << CAP_SETUID) |
135         (1ULL << CAP_SYS_ADMIN) |
136         (1ULL << CAP_SYS_CHROOT) |
137         (1ULL << CAP_SYS_NICE) |
138         (1ULL << CAP_SYS_PTRACE) |
139         (1ULL << CAP_SYS_TTY_CONFIG) |
140         (1ULL << CAP_SYS_RESOURCE) |
141         (1ULL << CAP_SYS_BOOT) |
142         (1ULL << CAP_AUDIT_WRITE) |
143         (1ULL << CAP_AUDIT_CONTROL) |
144         (1ULL << CAP_MKNOD);
145 static char **arg_bind = NULL;
146 static char **arg_bind_ro = NULL;
147 static char **arg_setenv = NULL;
148 static bool arg_quiet = false;
149 static bool arg_share_system = false;
150 static bool arg_register = true;
151 static bool arg_keep_unit = false;
152 static char **arg_network_interfaces = NULL;
153 static char **arg_network_macvlan = NULL;
154 static bool arg_network_veth = false;
155 static const char *arg_network_bridge = NULL;
156 static unsigned long arg_personality = 0xffffffffLU;
157 static const char *arg_image = NULL;
158
/* Print the usage summary for all supported options to stdout.
 *
 * Always returns 0, which parse_argv() hands back to its caller unchanged. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "  -i --image=PATH           File system device or image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               /* -j is handled as LINK_GUEST by parse_argv(), so say "guest" here */
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               /* accepted by parse_argv() but previously undocumented */
               "     --personality=NAME     Select the kernel personality for the container\n",
               program_invocation_short_name);

        return 0;
}
210
211 static int parse_argv(int argc, char *argv[]) {
212
213         enum {
214                 ARG_VERSION = 0x100,
215                 ARG_PRIVATE_NETWORK,
216                 ARG_UUID,
217                 ARG_READ_ONLY,
218                 ARG_CAPABILITY,
219                 ARG_DROP_CAPABILITY,
220                 ARG_LINK_JOURNAL,
221                 ARG_BIND,
222                 ARG_BIND_RO,
223                 ARG_SETENV,
224                 ARG_SHARE_SYSTEM,
225                 ARG_REGISTER,
226                 ARG_KEEP_UNIT,
227                 ARG_NETWORK_INTERFACE,
228                 ARG_NETWORK_MACVLAN,
229                 ARG_NETWORK_VETH,
230                 ARG_NETWORK_BRIDGE,
231                 ARG_PERSONALITY,
232         };
233
234         static const struct option options[] = {
235                 { "help",                  no_argument,       NULL, 'h'                   },
236                 { "version",               no_argument,       NULL, ARG_VERSION           },
237                 { "directory",             required_argument, NULL, 'D'                   },
238                 { "user",                  required_argument, NULL, 'u'                   },
239                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
240                 { "boot",                  no_argument,       NULL, 'b'                   },
241                 { "uuid",                  required_argument, NULL, ARG_UUID              },
242                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
243                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
244                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
245                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
246                 { "bind",                  required_argument, NULL, ARG_BIND              },
247                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
248                 { "machine",               required_argument, NULL, 'M'                   },
249                 { "slice",                 required_argument, NULL, 'S'                   },
250                 { "setenv",                required_argument, NULL, ARG_SETENV            },
251                 { "selinux-context",       required_argument, NULL, 'Z'                   },
252                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
253                 { "quiet",                 no_argument,       NULL, 'q'                   },
254                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
255                 { "register",              required_argument, NULL, ARG_REGISTER          },
256                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
257                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
258                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
259                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
260                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
261                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
262                 { "image",                 required_argument, NULL, 'i'                   },
263                 {}
264         };
265
266         int c, r;
267         uint64_t plus = 0, minus = 0;
268
269         assert(argc >= 0);
270         assert(argv);
271
272         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
273
274                 switch (c) {
275
276                 case 'h':
277                         return help();
278
279                 case ARG_VERSION:
280                         puts(PACKAGE_STRING);
281                         puts(SYSTEMD_FEATURES);
282                         return 0;
283
284                 case 'D':
285                         free(arg_directory);
286                         arg_directory = canonicalize_file_name(optarg);
287                         if (!arg_directory) {
288                                 log_error("Invalid root directory: %m");
289                                 return -ENOMEM;
290                         }
291
292                         break;
293
294                 case 'i':
295                         arg_image = optarg;
296                         break;
297
298                 case 'u':
299                         free(arg_user);
300                         arg_user = strdup(optarg);
301                         if (!arg_user)
302                                 return log_oom();
303
304                         break;
305
306                 case ARG_NETWORK_BRIDGE:
307                         arg_network_bridge = optarg;
308
309                         /* fall through */
310
311                 case ARG_NETWORK_VETH:
312                         arg_network_veth = true;
313                         arg_private_network = true;
314                         break;
315
316                 case ARG_NETWORK_INTERFACE:
317                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
318                                 return log_oom();
319
320                         arg_private_network = true;
321                         break;
322
323                 case ARG_NETWORK_MACVLAN:
324                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
325                                 return log_oom();
326
327                         /* fall through */
328
329                 case ARG_PRIVATE_NETWORK:
330                         arg_private_network = true;
331                         break;
332
333                 case 'b':
334                         arg_boot = true;
335                         break;
336
337                 case ARG_UUID:
338                         r = sd_id128_from_string(optarg, &arg_uuid);
339                         if (r < 0) {
340                                 log_error("Invalid UUID: %s", optarg);
341                                 return r;
342                         }
343                         break;
344
345                 case 'S':
346                         arg_slice = optarg;
347                         break;
348
349                 case 'M':
350                         if (isempty(optarg)) {
351                                 free(arg_machine);
352                                 arg_machine = NULL;
353                         } else {
354
355                                 if (!hostname_is_valid(optarg)) {
356                                         log_error("Invalid machine name: %s", optarg);
357                                         return -EINVAL;
358                                 }
359
360                                 free(arg_machine);
361                                 arg_machine = strdup(optarg);
362                                 if (!arg_machine)
363                                         return log_oom();
364
365                                 break;
366                         }
367
368                 case 'Z':
369                         arg_selinux_context = optarg;
370                         break;
371
372                 case 'L':
373                         arg_selinux_apifs_context = optarg;
374                         break;
375
376                 case ARG_READ_ONLY:
377                         arg_read_only = true;
378                         break;
379
380                 case ARG_CAPABILITY:
381                 case ARG_DROP_CAPABILITY: {
382                         char *state, *word;
383                         size_t length;
384
385                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
386                                 _cleanup_free_ char *t;
387                                 cap_value_t cap;
388
389                                 t = strndup(word, length);
390                                 if (!t)
391                                         return log_oom();
392
393                                 if (streq(t, "all")) {
394                                         if (c == ARG_CAPABILITY)
395                                                 plus = (uint64_t) -1;
396                                         else
397                                                 minus = (uint64_t) -1;
398                                 } else {
399                                         if (cap_from_name(t, &cap) < 0) {
400                                                 log_error("Failed to parse capability %s.", t);
401                                                 return -EINVAL;
402                                         }
403
404                                         if (c == ARG_CAPABILITY)
405                                                 plus |= 1ULL << (uint64_t) cap;
406                                         else
407                                                 minus |= 1ULL << (uint64_t) cap;
408                                 }
409                         }
410
411                         break;
412                 }
413
414                 case 'j':
415                         arg_link_journal = LINK_GUEST;
416                         break;
417
418                 case ARG_LINK_JOURNAL:
419                         if (streq(optarg, "auto"))
420                                 arg_link_journal = LINK_AUTO;
421                         else if (streq(optarg, "no"))
422                                 arg_link_journal = LINK_NO;
423                         else if (streq(optarg, "guest"))
424                                 arg_link_journal = LINK_GUEST;
425                         else if (streq(optarg, "host"))
426                                 arg_link_journal = LINK_HOST;
427                         else {
428                                 log_error("Failed to parse link journal mode %s", optarg);
429                                 return -EINVAL;
430                         }
431
432                         break;
433
434                 case ARG_BIND:
435                 case ARG_BIND_RO: {
436                         _cleanup_free_ char *a = NULL, *b = NULL;
437                         char *e;
438                         char ***x;
439
440                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
441
442                         e = strchr(optarg, ':');
443                         if (e) {
444                                 a = strndup(optarg, e - optarg);
445                                 b = strdup(e + 1);
446                         } else {
447                                 a = strdup(optarg);
448                                 b = strdup(optarg);
449                         }
450
451                         if (!a || !b)
452                                 return log_oom();
453
454                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
455                                 log_error("Invalid bind mount specification: %s", optarg);
456                                 return -EINVAL;
457                         }
458
459                         r = strv_extend(x, a);
460                         if (r < 0)
461                                 return log_oom();
462
463                         r = strv_extend(x, b);
464                         if (r < 0)
465                                 return log_oom();
466
467                         break;
468                 }
469
470                 case ARG_SETENV: {
471                         char **n;
472
473                         if (!env_assignment_is_valid(optarg)) {
474                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
475                                 return -EINVAL;
476                         }
477
478                         n = strv_env_set(arg_setenv, optarg);
479                         if (!n)
480                                 return log_oom();
481
482                         strv_free(arg_setenv);
483                         arg_setenv = n;
484                         break;
485                 }
486
487                 case 'q':
488                         arg_quiet = true;
489                         break;
490
491                 case ARG_SHARE_SYSTEM:
492                         arg_share_system = true;
493                         break;
494
495                 case ARG_REGISTER:
496                         r = parse_boolean(optarg);
497                         if (r < 0) {
498                                 log_error("Failed to parse --register= argument: %s", optarg);
499                                 return r;
500                         }
501
502                         arg_register = r;
503                         break;
504
505                 case ARG_KEEP_UNIT:
506                         arg_keep_unit = true;
507                         break;
508
509                 case ARG_PERSONALITY:
510
511                         arg_personality = personality_from_string(optarg);
512                         if (arg_personality == 0xffffffffLU) {
513                                 log_error("Unknown or unsupported personality '%s'.", optarg);
514                                 return -EINVAL;
515                         }
516
517                         break;
518
519                 case '?':
520                         return -EINVAL;
521
522                 default:
523                         assert_not_reached("Unhandled option");
524                 }
525         }
526
527         if (arg_share_system)
528                 arg_register = false;
529
530         if (arg_boot && arg_share_system) {
531                 log_error("--boot and --share-system may not be combined.");
532                 return -EINVAL;
533         }
534
535         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
536                 log_error("--keep-unit may not be used when invoked from a user session.");
537                 return -EINVAL;
538         }
539
540         if (arg_directory && arg_image) {
541                 log_error("--directory= and --image= may not be combined.");
542                 return -EINVAL;
543         }
544
545         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
546
547         return 1;
548 }
549
550 static int mount_all(const char *dest) {
551
552         typedef struct MountPoint {
553                 const char *what;
554                 const char *where;
555                 const char *type;
556                 const char *options;
557                 unsigned long flags;
558                 bool fatal;
559         } MountPoint;
560
561         static const MountPoint mount_table[] = {
562                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
563                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
564                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
565                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
566                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
567                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
568                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
569                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
570 #ifdef HAVE_SELINUX
571                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
572                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
573 #endif
574         };
575
576         unsigned k;
577         int r = 0;
578
579         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
580                 _cleanup_free_ char *where = NULL;
581 #ifdef HAVE_SELINUX
582                 _cleanup_free_ char *options = NULL;
583 #endif
584                 const char *o;
585                 int t;
586
587                 where = strjoin(dest, "/", mount_table[k].where, NULL);
588                 if (!where)
589                         return log_oom();
590
591                 t = path_is_mount_point(where, true);
592                 if (t < 0) {
593                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
594
595                         if (r == 0)
596                                 r = t;
597
598                         continue;
599                 }
600
601                 /* Skip this entry if it is not a remount. */
602                 if (mount_table[k].what && t > 0)
603                         continue;
604
605                 mkdir_p(where, 0755);
606
607 #ifdef HAVE_SELINUX
608                 if (arg_selinux_apifs_context &&
609                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
610                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
611                         if (!options)
612                                 return log_oom();
613
614                         o = options;
615                 } else
616 #endif
617                         o = mount_table[k].options;
618
619
620                 if (mount(mount_table[k].what,
621                           where,
622                           mount_table[k].type,
623                           mount_table[k].flags,
624                           o) < 0 &&
625                     mount_table[k].fatal) {
626
627                         log_error("mount(%s) failed: %m", where);
628
629                         if (r == 0)
630                                 r = -errno;
631                 }
632         }
633
634         return r;
635 }
636
637 static int mount_binds(const char *dest, char **l, unsigned long flags) {
638         char **x, **y;
639
640         STRV_FOREACH_PAIR(x, y, l) {
641                 char *where;
642                 struct stat source_st, dest_st;
643                 int r;
644
645                 if (stat(*x, &source_st) < 0) {
646                         log_error("Failed to stat %s: %m", *x);
647                         return -errno;
648                 }
649
650                 where = strappenda(dest, *y);
651                 r = stat(where, &dest_st);
652                 if (r == 0) {
653                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
654                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
655                                                 *x, where);
656                                 return -EINVAL;
657                         }
658                 } else if (errno == ENOENT) {
659                         r = mkdir_parents_label(where, 0755);
660                         if (r < 0) {
661                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
662                                 return r;
663                         }
664                 } else {
665                         log_error("Failed to bind mount %s: %m", *x);
666                         return -errno;
667                 }
668                 /* Create the mount point, but be conservative -- refuse to create block
669                 * and char devices. */
670                 if (S_ISDIR(source_st.st_mode))
671                         mkdir_label(where, 0755);
672                 else if (S_ISFIFO(source_st.st_mode))
673                         mkfifo(where, 0644);
674                 else if (S_ISSOCK(source_st.st_mode))
675                         mknod(where, 0644 | S_IFSOCK, 0);
676                 else if (S_ISREG(source_st.st_mode))
677                         touch(where);
678                 else {
679                         log_error("Refusing to create mountpoint for file: %s", *x);
680                         return -ENOTSUP;
681                 }
682
683                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
684                         log_error("mount(%s) failed: %m", where);
685                         return -errno;
686                 }
687
688                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
689                         log_error("mount(%s) failed: %m", where);
690                         return -errno;
691                 }
692         }
693
694         return 0;
695 }
696
697 static int setup_timezone(const char *dest) {
698         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
699         char *z, *y;
700         int r;
701
702         assert(dest);
703
704         /* Fix the timezone, if possible */
705         r = readlink_malloc("/etc/localtime", &p);
706         if (r < 0) {
707                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
708                 return 0;
709         }
710
711         z = path_startswith(p, "../usr/share/zoneinfo/");
712         if (!z)
713                 z = path_startswith(p, "/usr/share/zoneinfo/");
714         if (!z) {
715                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
716                 return 0;
717         }
718
719         where = strappend(dest, "/etc/localtime");
720         if (!where)
721                 return log_oom();
722
723         r = readlink_malloc(where, &q);
724         if (r >= 0) {
725                 y = path_startswith(q, "../usr/share/zoneinfo/");
726                 if (!y)
727                         y = path_startswith(q, "/usr/share/zoneinfo/");
728
729
730                 /* Already pointing to the right place? Then do nothing .. */
731                 if (y && streq(y, z))
732                         return 0;
733         }
734
735         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
736         if (!check)
737                 return log_oom();
738
739         if (access(check, F_OK) < 0) {
740                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
741                 return 0;
742         }
743
744         what = strappend("../usr/share/zoneinfo/", z);
745         if (!what)
746                 return log_oom();
747
748         unlink(where);
749         if (symlink(what, where) < 0) {
750                 log_error("Failed to correct timezone of container: %m");
751                 return 0;
752         }
753
754         return 0;
755 }
756
757 static int setup_resolv_conf(const char *dest) {
758         char _cleanup_free_ *where = NULL;
759
760         assert(dest);
761
762         if (arg_private_network)
763                 return 0;
764
765         /* Fix resolv.conf, if possible */
766         where = strappend(dest, "/etc/resolv.conf");
767         if (!where)
768                 return log_oom();
769
770         /* We don't really care for the results of this really. If it
771          * fails, it fails, but meh... */
772         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
773
774         return 0;
775 }
776
777 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
778
779         snprintf(s, 37,
780                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
781                  SD_ID128_FORMAT_VAL(id));
782
783         return s;
784 }
785
786 static int setup_boot_id(const char *dest) {
787         _cleanup_free_ char *from = NULL, *to = NULL;
788         sd_id128_t rnd = {};
789         char as_uuid[37];
790         int r;
791
792         assert(dest);
793
794         if (arg_share_system)
795                 return 0;
796
797         /* Generate a new randomized boot ID, so that each boot-up of
798          * the container gets a new one */
799
800         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
801         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
802         if (!from || !to)
803                 return log_oom();
804
805         r = sd_id128_randomize(&rnd);
806         if (r < 0) {
807                 log_error("Failed to generate random boot id: %s", strerror(-r));
808                 return r;
809         }
810
811         id128_format_as_uuid(rnd, as_uuid);
812
813         r = write_string_file(from, as_uuid);
814         if (r < 0) {
815                 log_error("Failed to write boot id: %s", strerror(-r));
816                 return r;
817         }
818
819         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
820                 log_error("Failed to bind mount boot id: %m");
821                 r = -errno;
822         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
823                 log_warning("Failed to make boot id read-only: %m");
824
825         unlink(from);
826         return r;
827 }
828
829 static int copy_devnodes(const char *dest) {
830
831         static const char devnodes[] =
832                 "null\0"
833                 "zero\0"
834                 "full\0"
835                 "random\0"
836                 "urandom\0"
837                 "tty\0";
838
839         const char *d;
840         int r = 0;
841         _cleanup_umask_ mode_t u;
842
843         assert(dest);
844
845         u = umask(0000);
846
847         NULSTR_FOREACH(d, devnodes) {
848                 _cleanup_free_ char *from = NULL, *to = NULL;
849                 struct stat st;
850
851                 from = strappend("/dev/", d);
852                 to = strjoin(dest, "/dev/", d, NULL);
853                 if (!from || !to)
854                         return log_oom();
855
856                 if (stat(from, &st) < 0) {
857
858                         if (errno != ENOENT) {
859                                 log_error("Failed to stat %s: %m", from);
860                                 return -errno;
861                         }
862
863                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
864
865                         log_error("%s is not a char or block device, cannot copy", from);
866                         return -EIO;
867
868                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
869
870                         log_error("mknod(%s) failed: %m", dest);
871                         return  -errno;
872                 }
873         }
874
875         return r;
876 }
877
878 static int setup_ptmx(const char *dest) {
879         _cleanup_free_ char *p = NULL;
880
881         p = strappend(dest, "/dev/ptmx");
882         if (!p)
883                 return log_oom();
884
885         if (symlink("pts/ptmx", p) < 0) {
886                 log_error("Failed to create /dev/ptmx symlink: %m");
887                 return -errno;
888         }
889
890         return 0;
891 }
892
893 static int setup_dev_console(const char *dest, const char *console) {
894         _cleanup_umask_ mode_t u;
895         const char *to;
896         struct stat st;
897         int r;
898
899         assert(dest);
900         assert(console);
901
902         u = umask(0000);
903
904         if (stat("/dev/null", &st) < 0) {
905                 log_error("Failed to stat /dev/null: %m");
906                 return -errno;
907         }
908
909         r = chmod_and_chown(console, 0600, 0, 0);
910         if (r < 0) {
911                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
912                 return r;
913         }
914
915         /* We need to bind mount the right tty to /dev/console since
916          * ptys can only exist on pts file systems. To have something
917          * to bind mount things on we create a device node first, and
918          * use /dev/null for that since we the cgroups device policy
919          * allows us to create that freely, while we cannot create
920          * /dev/console. (Note that the major minor doesn't actually
921          * matter here, since we mount it over anyway). */
922
923         to = strappenda(dest, "/dev/console");
924         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
925                 log_error("mknod() for /dev/console failed: %m");
926                 return -errno;
927         }
928
929         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
930                 log_error("Bind mount for /dev/console failed: %m");
931                 return -errno;
932         }
933
934         return 0;
935 }
936
937 static int setup_kmsg(const char *dest, int kmsg_socket) {
938         _cleanup_free_ char *from = NULL, *to = NULL;
939         int r, fd, k;
940         _cleanup_umask_ mode_t u;
941         union {
942                 struct cmsghdr cmsghdr;
943                 uint8_t buf[CMSG_SPACE(sizeof(int))];
944         } control = {};
945         struct msghdr mh = {
946                 .msg_control = &control,
947                 .msg_controllen = sizeof(control),
948         };
949         struct cmsghdr *cmsg;
950
951         assert(dest);
952         assert(kmsg_socket >= 0);
953
954         u = umask(0000);
955
956         /* We create the kmsg FIFO as /dev/kmsg, but immediately
957          * delete it after bind mounting it to /proc/kmsg. While FIFOs
958          * on the reading side behave very similar to /proc/kmsg,
959          * their writing side behaves differently from /dev/kmsg in
960          * that writing blocks when nothing is reading. In order to
961          * avoid any problems with containers deadlocking due to this
962          * we simply make /dev/kmsg unavailable to the container. */
963         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
964             asprintf(&to, "%s/proc/kmsg", dest) < 0)
965                 return log_oom();
966
967         if (mkfifo(from, 0600) < 0) {
968                 log_error("mkfifo() for /dev/kmsg failed: %m");
969                 return -errno;
970         }
971
972         r = chmod_and_chown(from, 0600, 0, 0);
973         if (r < 0) {
974                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
975                 return r;
976         }
977
978         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
979                 log_error("Bind mount for /proc/kmsg failed: %m");
980                 return -errno;
981         }
982
983         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
984         if (fd < 0) {
985                 log_error("Failed to open fifo: %m");
986                 return -errno;
987         }
988
989         cmsg = CMSG_FIRSTHDR(&mh);
990         cmsg->cmsg_level = SOL_SOCKET;
991         cmsg->cmsg_type = SCM_RIGHTS;
992         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
993         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
994
995         mh.msg_controllen = cmsg->cmsg_len;
996
997         /* Store away the fd in the socket, so that it stays open as
998          * long as we run the child */
999         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1000         safe_close(fd);
1001
1002         if (k < 0) {
1003                 log_error("Failed to send FIFO fd: %m");
1004                 return -errno;
1005         }
1006
1007         /* And now make the FIFO unavailable as /dev/kmsg... */
1008         unlink(from);
1009         return 0;
1010 }
1011
1012 static int setup_hostname(void) {
1013
1014         if (arg_share_system)
1015                 return 0;
1016
1017         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1018                 return -errno;
1019
1020         return 0;
1021 }
1022
1023 static int setup_journal(const char *directory) {
1024         sd_id128_t machine_id, this_id;
1025         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1026         char *id;
1027         int r;
1028
1029         p = strappend(directory, "/etc/machine-id");
1030         if (!p)
1031                 return log_oom();
1032
1033         r = read_one_line_file(p, &b);
1034         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1035                 return 0;
1036         else if (r < 0) {
1037                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1038                 return r;
1039         }
1040
1041         id = strstrip(b);
1042         if (isempty(id) && arg_link_journal == LINK_AUTO)
1043                 return 0;
1044
1045         /* Verify validity */
1046         r = sd_id128_from_string(id, &machine_id);
1047         if (r < 0) {
1048                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1049                 return r;
1050         }
1051
1052         r = sd_id128_get_machine(&this_id);
1053         if (r < 0) {
1054                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1055                 return r;
1056         }
1057
1058         if (sd_id128_equal(machine_id, this_id)) {
1059                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1060                          "Host and machine ids are equal (%s): refusing to link journals", id);
1061                 if (arg_link_journal == LINK_AUTO)
1062                         return 0;
1063                 return
1064                         -EEXIST;
1065         }
1066
1067         if (arg_link_journal == LINK_NO)
1068                 return 0;
1069
1070         free(p);
1071         p = strappend("/var/log/journal/", id);
1072         q = strjoin(directory, "/var/log/journal/", id, NULL);
1073         if (!p || !q)
1074                 return log_oom();
1075
1076         if (path_is_mount_point(p, false) > 0) {
1077                 if (arg_link_journal != LINK_AUTO) {
1078                         log_error("%s: already a mount point, refusing to use for journal", p);
1079                         return -EEXIST;
1080                 }
1081
1082                 return 0;
1083         }
1084
1085         if (path_is_mount_point(q, false) > 0) {
1086                 if (arg_link_journal != LINK_AUTO) {
1087                         log_error("%s: already a mount point, refusing to use for journal", q);
1088                         return -EEXIST;
1089                 }
1090
1091                 return 0;
1092         }
1093
1094         r = readlink_and_make_absolute(p, &d);
1095         if (r >= 0) {
1096                 if ((arg_link_journal == LINK_GUEST ||
1097                      arg_link_journal == LINK_AUTO) &&
1098                     path_equal(d, q)) {
1099
1100                         r = mkdir_p(q, 0755);
1101                         if (r < 0)
1102                                 log_warning("failed to create directory %s: %m", q);
1103                         return 0;
1104                 }
1105
1106                 if (unlink(p) < 0) {
1107                         log_error("Failed to remove symlink %s: %m", p);
1108                         return -errno;
1109                 }
1110         } else if (r == -EINVAL) {
1111
1112                 if (arg_link_journal == LINK_GUEST &&
1113                     rmdir(p) < 0) {
1114
1115                         if (errno == ENOTDIR) {
1116                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1117                                 return r;
1118                         } else {
1119                                 log_error("Failed to remove %s: %m", p);
1120                                 return -errno;
1121                         }
1122                 }
1123         } else if (r != -ENOENT) {
1124                 log_error("readlink(%s) failed: %m", p);
1125                 return r;
1126         }
1127
1128         if (arg_link_journal == LINK_GUEST) {
1129
1130                 if (symlink(q, p) < 0) {
1131                         log_error("Failed to symlink %s to %s: %m", q, p);
1132                         return -errno;
1133                 }
1134
1135                 r = mkdir_p(q, 0755);
1136                 if (r < 0)
1137                         log_warning("failed to create directory %s: %m", q);
1138                 return 0;
1139         }
1140
1141         if (arg_link_journal == LINK_HOST) {
1142                 r = mkdir_p(p, 0755);
1143                 if (r < 0) {
1144                         log_error("Failed to create %s: %m", p);
1145                         return r;
1146                 }
1147
1148         } else if (access(p, F_OK) < 0)
1149                 return 0;
1150
1151         if (dir_is_empty(q) == 0)
1152                 log_warning("%s is not empty, proceeding anyway.", q);
1153
1154         r = mkdir_p(q, 0755);
1155         if (r < 0) {
1156                 log_error("Failed to create %s: %m", q);
1157                 return r;
1158         }
1159
1160         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1161                 log_error("Failed to bind mount journal from host into guest: %m");
1162                 return -errno;
1163         }
1164
1165         return 0;
1166 }
1167
1168 static int setup_kdbus(const char *dest, const char *path) {
1169         const char *p;
1170
1171         if (!path)
1172                 return 0;
1173
1174         p = strappenda(dest, "/dev/kdbus");
1175         if (mkdir(p, 0755) < 0) {
1176                 log_error("Failed to create kdbus path: %m");
1177                 return  -errno;
1178         }
1179
1180         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1181                 log_error("Failed to mount kdbus domain path: %m");
1182                 return -errno;
1183         }
1184
1185         return 0;
1186 }
1187
1188 static int drop_capabilities(void) {
1189         return capability_bounding_set_drop(~arg_retain, false);
1190 }
1191
1192 static int register_machine(pid_t pid) {
1193         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1194         _cleanup_bus_unref_ sd_bus *bus = NULL;
1195         int r;
1196
1197         if (!arg_register)
1198                 return 0;
1199
1200         r = sd_bus_default_system(&bus);
1201         if (r < 0) {
1202                 log_error("Failed to open system bus: %s", strerror(-r));
1203                 return r;
1204         }
1205
1206         if (arg_keep_unit) {
1207                 r = sd_bus_call_method(
1208                                 bus,
1209                                 "org.freedesktop.machine1",
1210                                 "/org/freedesktop/machine1",
1211                                 "org.freedesktop.machine1.Manager",
1212                                 "RegisterMachine",
1213                                 &error,
1214                                 NULL,
1215                                 "sayssus",
1216                                 arg_machine,
1217                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1218                                 "nspawn",
1219                                 "container",
1220                                 (uint32_t) pid,
1221                                 strempty(arg_directory));
1222         } else {
1223                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1224
1225                 r = sd_bus_message_new_method_call(
1226                                 bus,
1227                                 &m,
1228                                 "org.freedesktop.machine1",
1229                                 "/org/freedesktop/machine1",
1230                                 "org.freedesktop.machine1.Manager",
1231                                 "CreateMachine");
1232                 if (r < 0) {
1233                         log_error("Failed to create message: %s", strerror(-r));
1234                         return r;
1235                 }
1236
1237                 r = sd_bus_message_append(
1238                                 m,
1239                                 "sayssus",
1240                                 arg_machine,
1241                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1242                                 "nspawn",
1243                                 "container",
1244                                 (uint32_t) pid,
1245                                 strempty(arg_directory));
1246                 if (r < 0) {
1247                         log_error("Failed to append message arguments: %s", strerror(-r));
1248                         return r;
1249                 }
1250
1251                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1252                 if (r < 0) {
1253                         log_error("Failed to open container: %s", strerror(-r));
1254                         return r;
1255                 }
1256
1257                 if (!isempty(arg_slice)) {
1258                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1259                         if (r < 0) {
1260                                 log_error("Failed to append slice: %s", strerror(-r));
1261                                 return r;
1262                         }
1263                 }
1264
1265                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1266                 if (r < 0) {
1267                         log_error("Failed to add device policy: %s", strerror(-r));
1268                         return r;
1269                 }
1270
1271                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1272                                           /* Allow the container to
1273                                            * access and create the API
1274                                            * device nodes, so that
1275                                            * PrivateDevices= in the
1276                                            * container can work
1277                                            * fine */
1278                                           "/dev/null", "rwm",
1279                                           "/dev/zero", "rwm",
1280                                           "/dev/full", "rwm",
1281                                           "/dev/random", "rwm",
1282                                           "/dev/urandom", "rwm",
1283                                           "/dev/tty", "rwm",
1284                                           /* Allow the container
1285                                            * access to ptys. However,
1286                                            * do not permit the
1287                                            * container to ever create
1288                                            * these device nodes. */
1289                                           "/dev/pts/ptmx", "rw",
1290                                           "char-pts", "rw",
1291                                           /* Allow the container
1292                                            * access to all kdbus
1293                                            * devices. Again, the
1294                                            * container cannot create
1295                                            * these nodes, only use
1296                                            * them. We use a pretty
1297                                            * open match here, so that
1298                                            * the kernel API can still
1299                                            * change. */
1300                                           "char-kdbus", "rw",
1301                                           "char-kdbus/*", "rw");
1302                 if (r < 0) {
1303                         log_error("Failed to add device whitelist: %s", strerror(-r));
1304                         return r;
1305                 }
1306
1307                 r = sd_bus_message_close_container(m);
1308                 if (r < 0) {
1309                         log_error("Failed to close container: %s", strerror(-r));
1310                         return r;
1311                 }
1312
1313                 r = sd_bus_call(bus, m, 0, &error, NULL);
1314         }
1315
1316         if (r < 0) {
1317                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1318                 return r;
1319         }
1320
1321         return 0;
1322 }
1323
1324 static int terminate_machine(pid_t pid) {
1325         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1326         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1327         _cleanup_bus_unref_ sd_bus *bus = NULL;
1328         const char *path;
1329         int r;
1330
1331         if (!arg_register)
1332                 return 0;
1333
1334         r = sd_bus_default_system(&bus);
1335         if (r < 0) {
1336                 log_error("Failed to open system bus: %s", strerror(-r));
1337                 return r;
1338         }
1339
1340         r = sd_bus_call_method(
1341                         bus,
1342                         "org.freedesktop.machine1",
1343                         "/org/freedesktop/machine1",
1344                         "org.freedesktop.machine1.Manager",
1345                         "GetMachineByPID",
1346                         &error,
1347                         &reply,
1348                         "u",
1349                         (uint32_t) pid);
1350         if (r < 0) {
1351                 /* Note that the machine might already have been
1352                  * cleaned up automatically, hence don't consider it a
1353                  * failure if we cannot get the machine object. */
1354                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1355                 return 0;
1356         }
1357
1358         r = sd_bus_message_read(reply, "o", &path);
1359         if (r < 0)
1360                 return bus_log_parse_error(r);
1361
1362         r = sd_bus_call_method(
1363                         bus,
1364                         "org.freedesktop.machine1",
1365                         path,
1366                         "org.freedesktop.machine1.Machine",
1367                         "Terminate",
1368                         &error,
1369                         NULL,
1370                         NULL);
1371         if (r < 0) {
1372                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1373                 return 0;
1374         }
1375
1376         return 0;
1377 }
1378
1379 static int reset_audit_loginuid(void) {
1380         _cleanup_free_ char *p = NULL;
1381         int r;
1382
1383         if (arg_share_system)
1384                 return 0;
1385
1386         r = read_one_line_file("/proc/self/loginuid", &p);
1387         if (r == -ENOENT)
1388                 return 0;
1389         if (r < 0) {
1390                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1391                 return r;
1392         }
1393
1394         /* Already reset? */
1395         if (streq(p, "4294967295"))
1396                 return 0;
1397
1398         r = write_string_file("/proc/self/loginuid", "4294967295");
1399         if (r < 0) {
1400                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1401                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1402                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1403                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1404                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1405
1406                 sleep(5);
1407         }
1408
1409         return 0;
1410 }
1411
1412 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1413
1414 static int get_mac(struct ether_addr *mac) {
1415         int r;
1416
1417         uint8_t result[8];
1418         size_t l, sz;
1419         uint8_t *v;
1420
1421         l = strlen(arg_machine);
1422         sz = sizeof(sd_id128_t) + l;
1423         v = alloca(sz);
1424
1425         /* fetch some persistent data unique to the host */
1426         r = sd_id128_get_machine((sd_id128_t*) v);
1427         if (r < 0)
1428                 return r;
1429
1430         /* combine with some data unique (on this host) to this
1431          * container instance */
1432         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1433
1434         /* Let's hash the host machine ID plus the container name. We
1435          * use a fixed, but originally randomly created hash key here. */
1436         siphash24(result, v, sz, HASH_KEY.bytes);
1437
1438         assert_cc(ETH_ALEN <= sizeof(result));
1439         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1440
1441         /* see eth_random_addr in the kernel */
1442         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1443         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1444
1445         return 0;
1446 }
1447
1448 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1449         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1450         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1451         struct ether_addr mac;
1452         int r;
1453
1454         if (!arg_private_network)
1455                 return 0;
1456
1457         if (!arg_network_veth)
1458                 return 0;
1459
1460         /* Use two different interface name prefixes depending whether
1461          * we are in bridge mode or not. */
1462         if (arg_network_bridge)
1463                 memcpy(iface_name, "vb-", 3);
1464         else
1465                 memcpy(iface_name, "ve-", 3);
1466         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1467
1468         r = get_mac(&mac);
1469         if (r < 0) {
1470                 log_error("Failed to generate predictable MAC address for host0");
1471                 return r;
1472         }
1473
1474         r = sd_rtnl_open(&rtnl, 0);
1475         if (r < 0) {
1476                 log_error("Failed to connect to netlink: %s", strerror(-r));
1477                 return r;
1478         }
1479
1480         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1481         if (r < 0) {
1482                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1483                 return r;
1484         }
1485
1486         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1487         if (r < 0) {
1488                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1489                 return r;
1490         }
1491
1492         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1493         if (r < 0) {
1494                 log_error("Failed to open netlink container: %s", strerror(-r));
1495                 return r;
1496         }
1497
1498         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1499         if (r < 0) {
1500                 log_error("Failed to open netlink container: %s", strerror(-r));
1501                 return r;
1502         }
1503
1504         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1505         if (r < 0) {
1506                 log_error("Failed to open netlink container: %s", strerror(-r));
1507                 return r;
1508         }
1509
1510         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1511         if (r < 0) {
1512                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1513                 return r;
1514         }
1515
1516         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1517         if (r < 0) {
1518                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1519                 return r;
1520         }
1521
1522         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1523         if (r < 0) {
1524                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1525                 return r;
1526         }
1527
1528         r = sd_rtnl_message_close_container(m);
1529         if (r < 0) {
1530                 log_error("Failed to close netlink container: %s", strerror(-r));
1531                 return r;
1532         }
1533
1534         r = sd_rtnl_message_close_container(m);
1535         if (r < 0) {
1536                 log_error("Failed to close netlink container: %s", strerror(-r));
1537                 return r;
1538         }
1539
1540         r = sd_rtnl_message_close_container(m);
1541         if (r < 0) {
1542                 log_error("Failed to close netlink container: %s", strerror(-r));
1543                 return r;
1544         }
1545
1546         r = sd_rtnl_call(rtnl, m, 0, NULL);
1547         if (r < 0) {
1548                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1549                 return r;
1550         }
1551
1552         return 0;
1553 }
1554
1555 static int setup_bridge(const char veth_name[]) {
1556         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1557         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1558         int r, bridge;
1559
1560         if (!arg_private_network)
1561                 return 0;
1562
1563         if (!arg_network_veth)
1564                 return 0;
1565
1566         if (!arg_network_bridge)
1567                 return 0;
1568
1569         bridge = (int) if_nametoindex(arg_network_bridge);
1570         if (bridge <= 0) {
1571                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1572                 return -errno;
1573         }
1574
1575         r = sd_rtnl_open(&rtnl, 0);
1576         if (r < 0) {
1577                 log_error("Failed to connect to netlink: %s", strerror(-r));
1578                 return r;
1579         }
1580
1581         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1582         if (r < 0) {
1583                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1584                 return r;
1585         }
1586
1587         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1588         if (r < 0) {
1589                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1590                 return r;
1591         }
1592
1593         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1594         if (r < 0) {
1595                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1596                 return r;
1597         }
1598
1599         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1600         if (r < 0) {
1601                 log_error("Failed to add netlink master field: %s", strerror(-r));
1602                 return r;
1603         }
1604
1605         r = sd_rtnl_call(rtnl, m, 0, NULL);
1606         if (r < 0) {
1607                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1608                 return r;
1609         }
1610
1611         return 0;
1612 }
1613
1614 static int parse_interface(struct udev *udev, const char *name) {
1615         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1616         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1617         int ifi;
1618
1619         ifi = (int) if_nametoindex(name);
1620         if (ifi <= 0) {
1621                 log_error("Failed to resolve interface %s: %m", name);
1622                 return -errno;
1623         }
1624
1625         sprintf(ifi_str, "n%i", ifi);
1626         d = udev_device_new_from_device_id(udev, ifi_str);
1627         if (!d) {
1628                 log_error("Failed to get udev device for interface %s: %m", name);
1629                 return -errno;
1630         }
1631
1632         if (udev_device_get_is_initialized(d) <= 0) {
1633                 log_error("Network interface %s is not initialized yet.", name);
1634                 return -EBUSY;
1635         }
1636
1637         return ifi;
1638 }
1639
1640 static int move_network_interfaces(pid_t pid) {
1641         _cleanup_udev_unref_ struct udev *udev = NULL;
1642         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1643         char **i;
1644         int r;
1645
1646         if (!arg_private_network)
1647                 return 0;
1648
1649         if (strv_isempty(arg_network_interfaces))
1650                 return 0;
1651
1652         r = sd_rtnl_open(&rtnl, 0);
1653         if (r < 0) {
1654                 log_error("Failed to connect to netlink: %s", strerror(-r));
1655                 return r;
1656         }
1657
1658         udev = udev_new();
1659         if (!udev) {
1660                 log_error("Failed to connect to udev.");
1661                 return -ENOMEM;
1662         }
1663
1664         STRV_FOREACH(i, arg_network_interfaces) {
1665                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1666                 int ifi;
1667
1668                 ifi = parse_interface(udev, *i);
1669                 if (ifi < 0)
1670                         return ifi;
1671
1672                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1673                 if (r < 0) {
1674                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1675                         return r;
1676                 }
1677
1678                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1679                 if (r < 0) {
1680                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1681                         return r;
1682                 }
1683
1684                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1685                 if (r < 0) {
1686                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1687                         return r;
1688                 }
1689         }
1690
1691         return 0;
1692 }
1693
1694 static int setup_macvlan(pid_t pid) {
1695         _cleanup_udev_unref_ struct udev *udev = NULL;
1696         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1697         char **i;
1698         int r;
1699
1700         if (!arg_private_network)
1701                 return 0;
1702
1703         if (strv_isempty(arg_network_macvlan))
1704                 return 0;
1705
1706         r = sd_rtnl_open(&rtnl, 0);
1707         if (r < 0) {
1708                 log_error("Failed to connect to netlink: %s", strerror(-r));
1709                 return r;
1710         }
1711
1712         udev = udev_new();
1713         if (!udev) {
1714                 log_error("Failed to connect to udev.");
1715                 return -ENOMEM;
1716         }
1717
1718         STRV_FOREACH(i, arg_network_macvlan) {
1719                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1720                 _cleanup_free_ char *n = NULL;
1721                 int ifi;
1722
1723                 ifi = parse_interface(udev, *i);
1724                 if (ifi < 0)
1725                         return ifi;
1726
1727                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1728                 if (r < 0) {
1729                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1730                         return r;
1731                 }
1732
1733                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1734                 if (r < 0) {
1735                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1736                         return r;
1737                 }
1738
1739                 n = strappend("mv-", *i);
1740                 if (!n)
1741                         return log_oom();
1742
1743                 strshorten(n, IFNAMSIZ-1);
1744
1745                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1746                 if (r < 0) {
1747                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1748                         return r;
1749                 }
1750
1751                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1752                 if (r < 0) {
1753                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1754                         return r;
1755                 }
1756
1757                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1758                 if (r < 0) {
1759                         log_error("Failed to open netlink container: %s", strerror(-r));
1760                         return r;
1761                 }
1762
1763                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1764                 if (r < 0) {
1765                         log_error("Failed to open netlink container: %s", strerror(-r));
1766                         return r;
1767                 }
1768
1769                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1770                 if (r < 0) {
1771                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1772                         return r;
1773                 }
1774
1775                 r = sd_rtnl_message_close_container(m);
1776                 if (r < 0) {
1777                         log_error("Failed to close netlink container: %s", strerror(-r));
1778                         return r;
1779                 }
1780
1781                 r = sd_rtnl_message_close_container(m);
1782                 if (r < 0) {
1783                         log_error("Failed to close netlink container: %s", strerror(-r));
1784                         return r;
1785                 }
1786
1787                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1788                 if (r < 0) {
1789                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1790                         return r;
1791                 }
1792         }
1793
1794         return 0;
1795 }
1796
/* Installs a seccomp filter that allows everything except
 * socket(AF_NETLINK, *, NETLINK_AUDIT), which is made to fail with
 * EAFNOSUPPORT.
 *
 * Returns 0 on success or when built without seccomp support, a
 * negative errno-style value otherwise. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx ctx;
        int k;

        /*
           Audit does not work inside containers, and much of the
           userspace audit hookup fails when run in one. We do not
           care and simply prevent audit sockets from being created.

           With this filter socket(AF_NETLINK, *, NETLINK_AUDIT) fails
           with EAFNOSUPPORT, which audit userspace takes as the sign
           that audit is disabled in the kernel.
         */

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        k = seccomp_add_secondary_archs(ctx);
        if (k < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-k));
                goto release;
        }

        k = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (k < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-k));
                goto release;
        }

        /* Set the filter's NNP control attribute to 0 before loading */
        k = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (k < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-k));
                goto release;
        }

        k = seccomp_load(ctx);
        if (k < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-k));

release:
        seccomp_release(ctx);
        return k;
#else
        return 0;
#endif

}
1853
1854 static int setup_image(char **device_path, int *loop_nr) {
1855         struct loop_info64 info = {
1856                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1857         };
1858         _cleanup_close_ int fd = -1, control = -1, loop = -1;
1859         _cleanup_free_ char* loopdev = NULL;
1860         struct stat st;
1861         int r, nr;
1862
1863         assert(device_path);
1864         assert(loop_nr);
1865
1866         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1867         if (fd < 0) {
1868                 log_error("Failed to open %s: %m", arg_image);
1869                 return -errno;
1870         }
1871
1872         if (fstat(fd, &st) < 0) {
1873                 log_error("Failed to stat %s: %m", arg_image);
1874                 return -errno;
1875         }
1876
1877         if (S_ISBLK(st.st_mode)) {
1878                 char *p;
1879
1880                 p = strdup(arg_image);
1881                 if (!p)
1882                         return log_oom();
1883
1884                 *device_path = p;
1885
1886                 *loop_nr = -1;
1887
1888                 r = fd;
1889                 fd = -1;
1890
1891                 return r;
1892         }
1893
1894         if (!S_ISREG(st.st_mode)) {
1895                 log_error("%s is not a regular file or block device: %m", arg_image);
1896                 return -EINVAL;
1897         }
1898
1899         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1900         if (control < 0) {
1901                 log_error("Failed to open /dev/loop-control: %m");
1902                 return -errno;
1903         }
1904
1905         nr = ioctl(control, LOOP_CTL_GET_FREE);
1906         if (nr < 0) {
1907                 log_error("Failed to allocate loop device: %m");
1908                 return -errno;
1909         }
1910
1911         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1912                 return log_oom();
1913
1914         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1915         if (loop < 0) {
1916                 log_error("Failed to open loop device %s: %m", loopdev);
1917                 return -errno;
1918         }
1919
1920         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1921                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1922                 return -errno;
1923         }
1924
1925         if (arg_read_only)
1926                 info.lo_flags |= LO_FLAGS_READ_ONLY;
1927
1928         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1929                 log_error("Failed to set loopback settings on %s: %m", loopdev);
1930                 return -errno;
1931         }
1932
1933         *device_path = loopdev;
1934         loopdev = NULL;
1935
1936         *loop_nr = nr;
1937
1938         r = loop;
1939         loop = -1;
1940
1941         return r;
1942 }
1943
#ifdef HAVE_BLKID
/* Records node as the candidate for one partition role, unless a
 * candidate with a lower or equal partition number is already stored
 * in *device. Replaces *device (freeing the old string) and updates
 * *device_nr and *device_rw when the new partition wins. Returns 0,
 * or -ENOMEM if the node path cannot be duplicated (in which case
 * *device is left unchanged). */
static int dissect_prefer_partition(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, bool rw) {

        char *copy;

        assert(device);
        assert(device_nr);
        assert(device_rw);
        assert(node);

        if (*device && nr >= *device_nr)
                return 0;

        copy = strdup(node);
        if (!copy)
                return -ENOMEM;

        free(*device);
        *device = copy;
        *device_nr = nr;
        *device_rw = rw;

        return 0;
}
#endif

/* Probes the block device behind fd for a GUID Partition Table and
 * selects the partitions to use as root, /home and /srv.
 *
 * The partition devices are enumerated through udev and matched
 * against the probed partition list. Partitions flagged
 * GPT_FLAG_NO_AUTO are ignored. For every role the partition with the
 * lowest partition number is chosen; its read-write state is derived
 * from GPT_FLAG_READ_ONLY.
 *
 * On success *root_device/*root_device_rw are always set, preferring
 * a GPT_ROOT_NATIVE partition over a GPT_ROOT_SECONDARY one, and
 * *secondary tells which of the two was used. *home_device and
 * *srv_device (and their *_rw flags) are written only if such a
 * partition was found. The returned strings are newly allocated and
 * owned by the caller.
 *
 * Returns 0 on success, -EINVAL if no suitable partition table or
 * root partition is found, -ENOTSUP if compiled without blkid
 * support, or another negative errno-style value. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                /* Failure without errno is treated as out-of-memory */
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected */
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        /* Only look at devices below the whole-disk device */
        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Initialized so the cleanup handler never sees garbage */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                bool rw;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                rw = !(flags & GPT_FLAG_READ_ONLY);

                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_prefer_partition(&home, &home_nr, &home_rw, node, nr, rw);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_prefer_partition(&srv, &srv_nr, &srv_rw, node, nr, rw);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_prefer_partition(&root, &root_nr, &root_rw, node, nr, rw);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_prefer_partition(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, rw);
#endif
                else
                        continue;

                if (r < 0)
                        return log_oom();
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Pass ownership of the strings to the caller; resetting the
         * locals keeps the cleanup handlers from freeing them. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2193
/* Mounts the block device what onto where, or onto the concatenation
 * of where and directory if directory is non-NULL. The file system
 * type is detected with blkid. The mount always uses MS_NODEV and is
 * read-only if rw is false or arg_read_only is set.
 *
 * Returns 0 on success, -EINVAL if the file system type cannot be
 * determined, -ENOTSUP for "crypto_LUKS" devices or when compiled
 * without blkid support, or another negative errno-style value. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                /* Failure without errno is treated as out-of-memory */
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A hard
                 * probing error (-1) is handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2259
/* Mounts the given partitions below where: root_device on where
 * itself, home_device on where + "/home" and srv_device on
 * where + "/srv". Devices passed as NULL are skipped. The *_rw flags
 * are handed to mount_device() for the corresponding device.
 *
 * Returns 0 on success, or the negative errno-style value of the
 * first mount that fails. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount below the directory passed in, not the global
         * arg_directory, so that the parameter is actually honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2295
2296 static void loop_remove(int nr, int *image_fd) {
2297         _cleanup_close_ int control = -1;
2298
2299         if (nr < 0)
2300                 return;
2301
2302         if (image_fd && *image_fd >= 0) {
2303                 ioctl(*image_fd, LOOP_CLR_FD);
2304                 *image_fd = safe_close(*image_fd);
2305         }
2306
2307         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2308         if (control < 0)
2309                 return;
2310
2311         ioctl(control, LOOP_CTL_REMOVE, nr);
2312 }
2313
2314 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2315         int pipe_fds[2];
2316         pid_t pid;
2317
2318         assert(database);
2319         assert(key);
2320         assert(rpid);
2321
2322         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2323                 log_error("Failed to allocate pipe: %m");
2324                 return -errno;
2325         }
2326
2327         pid = fork();
2328         if (pid < 0) {
2329                 log_error("Failed to fork getent child: %m");
2330                 return -errno;
2331         } else if (pid == 0) {
2332                 int nullfd;
2333                 char *empty_env = NULL;
2334
2335                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2336                         _exit(EXIT_FAILURE);
2337
2338                 if (pipe_fds[0] > 2)
2339                         safe_close(pipe_fds[0]);
2340                 if (pipe_fds[1] > 2)
2341                         safe_close(pipe_fds[1]);
2342
2343                 nullfd = open("/dev/null", O_RDWR);
2344                 if (nullfd < 0)
2345                         _exit(EXIT_FAILURE);
2346
2347                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2348                         _exit(EXIT_FAILURE);
2349
2350                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2351                         _exit(EXIT_FAILURE);
2352
2353                 if (nullfd > 2)
2354                         safe_close(nullfd);
2355
2356                 reset_all_signal_handlers();
2357                 close_all_fds(NULL, 0);
2358
2359                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2360                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2361                 _exit(EXIT_FAILURE);
2362         }
2363
2364         pipe_fds[1] = safe_close(pipe_fds[1]);
2365
2366         *rpid = pid;
2367
2368         return pipe_fds[0];
2369 }
2370
2371 static int change_uid_gid(char **_home) {
2372         char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2373         _cleanup_free_ uid_t *uids = NULL;
2374         _cleanup_free_ char *home = NULL;
2375         _cleanup_fclose_ FILE *f = NULL;
2376         _cleanup_close_ int fd = -1;
2377         unsigned n_uids = 0;
2378         size_t sz = 0, l;
2379         uid_t uid;
2380         gid_t gid;
2381         pid_t pid;
2382         int r;
2383
2384         assert(_home);
2385
2386         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2387                 /* Reset everything fully to 0, just in case */
2388
2389                 if (setgroups(0, NULL) < 0) {
2390                         log_error("setgroups() failed: %m");
2391                         return -errno;
2392                 }
2393
2394                 if (setresgid(0, 0, 0) < 0) {
2395                         log_error("setregid() failed: %m");
2396                         return -errno;
2397                 }
2398
2399                 if (setresuid(0, 0, 0) < 0) {
2400                         log_error("setreuid() failed: %m");
2401                         return -errno;
2402                 }
2403
2404                 *_home = NULL;
2405                 return 0;
2406         }
2407
2408         /* First, get user credentials */
2409         fd = spawn_getent("passwd", arg_user, &pid);
2410         if (fd < 0)
2411                 return fd;
2412
2413         f = fdopen(fd, "r");
2414         if (!f)
2415                 return log_oom();
2416         fd = -1;
2417
2418         if (!fgets(line, sizeof(line), f)) {
2419
2420                 if (!ferror(f)) {
2421                         log_error("Failed to resolve user %s.", arg_user);
2422                         return -ESRCH;
2423                 }
2424
2425                 log_error("Failed to read from getent: %m");
2426                 return -errno;
2427         }
2428
2429         truncate_nl(line);
2430
2431         wait_for_terminate_and_warn("getent passwd", pid);
2432
2433         x = strchr(line, ':');
2434         if (!x) {
2435                 log_error("/etc/passwd entry has invalid user field.");
2436                 return -EIO;
2437         }
2438
2439         u = strchr(x+1, ':');
2440         if (!u) {
2441                 log_error("/etc/passwd entry has invalid password field.");
2442                 return -EIO;
2443         }
2444
2445         u++;
2446         g = strchr(u, ':');
2447         if (!g) {
2448                 log_error("/etc/passwd entry has invalid UID field.");
2449                 return -EIO;
2450         }
2451
2452         *g = 0;
2453         g++;
2454         x = strchr(g, ':');
2455         if (!x) {
2456                 log_error("/etc/passwd entry has invalid GID field.");
2457                 return -EIO;
2458         }
2459
2460         *x = 0;
2461         h = strchr(x+1, ':');
2462         if (!h) {
2463                 log_error("/etc/passwd entry has invalid GECOS field.");
2464                 return -EIO;
2465         }
2466
2467         h++;
2468         x = strchr(h, ':');
2469         if (!x) {
2470                 log_error("/etc/passwd entry has invalid home directory field.");
2471                 return -EIO;
2472         }
2473
2474         *x = 0;
2475
2476         r = parse_uid(u, &uid);
2477         if (r < 0) {
2478                 log_error("Failed to parse UID of user.");
2479                 return -EIO;
2480         }
2481
2482         r = parse_gid(g, &gid);
2483         if (r < 0) {
2484                 log_error("Failed to parse GID of user.");
2485                 return -EIO;
2486         }
2487
2488         home = strdup(h);
2489         if (!home)
2490                 return log_oom();
2491
2492         /* Second, get group memberships */
2493         fd = spawn_getent("initgroups", arg_user, &pid);
2494         if (fd < 0)
2495                 return fd;
2496
2497         fclose(f);
2498         f = fdopen(fd, "r");
2499         if (!f)
2500                 return log_oom();
2501         fd = -1;
2502
2503         if (!fgets(line, sizeof(line), f)) {
2504                 if (!ferror(f)) {
2505                         log_error("Failed to resolve user %s.", arg_user);
2506                         return -ESRCH;
2507                 }
2508
2509                 log_error("Failed to read from getent: %m");
2510                 return -errno;
2511         }
2512
2513         truncate_nl(line);
2514
2515         wait_for_terminate_and_warn("getent initgroups", pid);
2516
2517         /* Skip over the username and subsequent separator whitespace */
2518         x = line;
2519         x += strcspn(x, WHITESPACE);
2520         x += strspn(x, WHITESPACE);
2521
2522         FOREACH_WORD(w, l, x, state) {
2523                 char c[l+1];
2524
2525                 memcpy(c, w, l);
2526                 c[l] = 0;
2527
2528                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2529                         return log_oom();
2530
2531                 r = parse_uid(c, &uids[n_uids++]);
2532                 if (r < 0) {
2533                         log_error("Failed to parse group data from getent.");
2534                         return -EIO;
2535                 }
2536         }
2537
2538         r = mkdir_parents(home, 0775);
2539         if (r < 0) {
2540                 log_error("Failed to make home root directory: %s", strerror(-r));
2541                 return r;
2542         }
2543
2544         r = mkdir_safe(home, 0755, uid, gid);
2545         if (r < 0 && r != -EEXIST) {
2546                 log_error("Failed to make home directory: %s", strerror(-r));
2547                 return r;
2548         }
2549
2550         fchown(STDIN_FILENO, uid, gid);
2551         fchown(STDOUT_FILENO, uid, gid);
2552         fchown(STDERR_FILENO, uid, gid);
2553
2554         if (setgroups(n_uids, uids) < 0) {
2555                 log_error("Failed to set auxiliary groups: %m");
2556                 return -errno;
2557         }
2558
2559         if (setresgid(gid, gid, gid) < 0) {
2560                 log_error("setregid() failed: %m");
2561                 return -errno;
2562         }
2563
2564         if (setresuid(uid, uid, uid) < 0) {
2565                 log_error("setreuid() failed: %m");
2566                 return -errno;
2567         }
2568
2569         if (_home) {
2570                 *_home = home;
2571                 home = NULL;
2572         }
2573
2574         return 0;
2575 }
2576
2577 /*
2578  * Return 0 in case the container is being rebooted, has been shut
2579  * down or exited successfully. On failures a negative value is
2580  * returned.
2581  *
2582  * The status of the container "CONTAINER_TERMINATED" or
2583  * "CONTAINER_REBOOTED" will be saved in the container argument
2584  */
2585 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2586         int r;
2587         siginfo_t status;
2588
2589         r = wait_for_terminate(pid, &status);
2590         if (r < 0)
2591                 return r;
2592
2593         switch (status.si_code) {
2594         case CLD_EXITED:
2595                 r = status.si_status;
2596                 if (r == 0) {
2597                         if (!arg_quiet)
2598                                 log_debug("Container %s exited successfully.",
2599                                           arg_machine);
2600
2601                         *container = CONTAINER_TERMINATED;
2602                 } else {
2603                         log_error("Container %s failed with error code %i.",
2604                                   arg_machine, status.si_status);
2605                         r = -1;
2606                 }
2607                 break;
2608
2609         case CLD_KILLED:
2610                 if (status.si_status == SIGINT) {
2611                         if (!arg_quiet)
2612                                 log_info("Container %s has been shut down.",
2613                                          arg_machine);
2614
2615                         *container = CONTAINER_TERMINATED;
2616                         r = 0;
2617                         break;
2618                 } else if (status.si_status == SIGHUP) {
2619                         if (!arg_quiet)
2620                                 log_info("Container %s is being rebooted.",
2621                                          arg_machine);
2622
2623                         *container = CONTAINER_REBOOTED;
2624                         r = 0;
2625                         break;
2626                 }
2627                 /* CLD_KILLED fallthrough */
2628
2629         case CLD_DUMPED:
2630                 log_error("Container %s terminated by signal %s.",
2631                           arg_machine, signal_to_string(status.si_status));
2632                 r = -1;
2633                 break;
2634
2635         default:
2636                 log_error("Container %s failed due to unknown reason.",
2637                           arg_machine);
2638                 r = -1;
2639                 break;
2640         }
2641
2642         return r;
2643 }
2644
2645 int main(int argc, char *argv[]) {
2646
2647         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2648         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2649         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2650         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2651         _cleanup_fdset_free_ FDSet *fds = NULL;
2652         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2653         const char *console = NULL;
2654         char veth_name[IFNAMSIZ];
2655         bool secondary = false;
2656         pid_t pid = 0;
2657         sigset_t mask;
2658
2659         log_parse_environment();
2660         log_open();
2661
2662         k = parse_argv(argc, argv);
2663         if (k < 0)
2664                 goto finish;
2665         else if (k == 0) {
2666                 r = EXIT_SUCCESS;
2667                 goto finish;
2668         }
2669
2670         if (!arg_image) {
2671                 if (arg_directory) {
2672                         char *p;
2673
2674                         p = path_make_absolute_cwd(arg_directory);
2675                         free(arg_directory);
2676                         arg_directory = p;
2677                 } else
2678                         arg_directory = get_current_dir_name();
2679
2680                 if (!arg_directory) {
2681                         log_error("Failed to determine path, please use -D.");
2682                         goto finish;
2683                 }
2684                 path_kill_slashes(arg_directory);
2685         }
2686
2687         if (!arg_machine) {
2688                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2689                 if (!arg_machine) {
2690                         log_oom();
2691                         goto finish;
2692                 }
2693
2694                 hostname_cleanup(arg_machine, false);
2695                 if (isempty(arg_machine)) {
2696                         log_error("Failed to determine machine name automatically, please use -M.");
2697                         goto finish;
2698                 }
2699         }
2700
2701         if (geteuid() != 0) {
2702                 log_error("Need to be root.");
2703                 goto finish;
2704         }
2705
2706         if (sd_booted() <= 0) {
2707                 log_error("Not running on a systemd system.");
2708                 goto finish;
2709         }
2710
2711         log_close();
2712         n_fd_passed = sd_listen_fds(false);
2713         if (n_fd_passed > 0) {
2714                 k = fdset_new_listen_fds(&fds, false);
2715                 if (k < 0) {
2716                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2717                         goto finish;
2718                 }
2719         }
2720         fdset_close_others(fds);
2721         log_open();
2722
2723         if (arg_directory) {
2724                 if (path_equal(arg_directory, "/")) {
2725                         log_error("Spawning container on root directory not supported.");
2726                         goto finish;
2727                 }
2728
2729                 if (arg_boot) {
2730                         if (path_is_os_tree(arg_directory) <= 0) {
2731                                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2732                                 goto finish;
2733                         }
2734                 } else {
2735                         const char *p;
2736
2737                         p = strappenda(arg_directory,
2738                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2739                         if (access(p, F_OK) < 0) {
2740                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2741                                 goto finish;
2742
2743                         }
2744                 }
2745         } else {
2746                 char template[] = "/tmp/nspawn-root-XXXXXX";
2747
2748                 if (!mkdtemp(template)) {
2749                         log_error("Failed to create temporary directory: %m");
2750                         r = -errno;
2751                         goto finish;
2752                 }
2753
2754                 arg_directory = strdup(template);
2755                 if (!arg_directory) {
2756                         r = log_oom();
2757                         goto finish;
2758                 }
2759
2760                 image_fd = setup_image(&device_path, &loop_nr);
2761                 if (image_fd < 0) {
2762                         r = image_fd;
2763                         goto finish;
2764                 }
2765
2766                 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2767                 if (r < 0)
2768                         goto finish;
2769         }
2770
2771         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2772         if (master < 0) {
2773                 log_error("Failed to acquire pseudo tty: %m");
2774                 goto finish;
2775         }
2776
2777         console = ptsname(master);
2778         if (!console) {
2779                 log_error("Failed to determine tty name: %m");
2780                 goto finish;
2781         }
2782
2783         if (!arg_quiet)
2784                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
2785
2786         if (unlockpt(master) < 0) {
2787                 log_error("Failed to unlock tty: %m");
2788                 goto finish;
2789         }
2790
2791         if (access("/dev/kdbus/control", F_OK) >= 0) {
2792
2793                 if (arg_share_system) {
2794                         kdbus_domain = strdup("/dev/kdbus");
2795                         if (!kdbus_domain) {
2796                                 log_oom();
2797                                 goto finish;
2798                         }
2799                 } else {
2800                         const char *ns;
2801
2802                         ns = strappenda("machine-", arg_machine);
2803                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2804                         if (r < 0)
2805                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2806                         else
2807                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2808                 }
2809         }
2810
2811         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2812                 log_error("Failed to create kmsg socket pair: %m");
2813                 goto finish;
2814         }
2815
2816         sd_notify(0, "READY=1");
2817
2818         assert_se(sigemptyset(&mask) == 0);
2819         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2820         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2821
2822         for (;;) {
2823                 ContainerStatus container_status;
2824                 int parent_ready_fd = -1, child_ready_fd = -1;
2825                 eventfd_t x;
2826
2827                 parent_ready_fd = eventfd(0, EFD_CLOEXEC);
2828                 if (parent_ready_fd < 0) {
2829                         log_error("Failed to create event fd: %m");
2830                         goto finish;
2831                 }
2832
2833                 child_ready_fd = eventfd(0, EFD_CLOEXEC);
2834                 if (child_ready_fd < 0) {
2835                         log_error("Failed to create event fd: %m");
2836                         goto finish;
2837                 }
2838
2839                 pid = syscall(__NR_clone,
2840                               SIGCHLD|CLONE_NEWNS|
2841                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2842                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
2843                 if (pid < 0) {
2844                         if (errno == EINVAL)
2845                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2846                         else
2847                                 log_error("clone() failed: %m");
2848
2849                         goto finish;
2850                 }
2851
2852                 if (pid == 0) {
2853                         /* child */
2854                         _cleanup_free_ char *home = NULL;
2855                         unsigned n_env = 2;
2856                         const char *envp[] = {
2857                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
2858                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2859                                 NULL, /* TERM */
2860                                 NULL, /* HOME */
2861                                 NULL, /* USER */
2862                                 NULL, /* LOGNAME */
2863                                 NULL, /* container_uuid */
2864                                 NULL, /* LISTEN_FDS */
2865                                 NULL, /* LISTEN_PID */
2866                                 NULL
2867                         };
2868                         char **env_use;
2869
2870                         envp[n_env] = strv_find_prefix(environ, "TERM=");
2871                         if (envp[n_env])
2872                                 n_env ++;
2873
2874                         master = safe_close(master);
2875
2876                         close_nointr(STDIN_FILENO);
2877                         close_nointr(STDOUT_FILENO);
2878                         close_nointr(STDERR_FILENO);
2879
2880                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2881
2882                         reset_all_signal_handlers();
2883
2884                         assert_se(sigemptyset(&mask) == 0);
2885                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2886
2887                         k = open_terminal(console, O_RDWR);
2888                         if (k != STDIN_FILENO) {
2889                                 if (k >= 0) {
2890                                         safe_close(k);
2891                                         k = -EINVAL;
2892                                 }
2893
2894                                 log_error("Failed to open console: %s", strerror(-k));
2895                                 goto child_fail;
2896                         }
2897
2898                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2899                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2900                                 log_error("Failed to duplicate console: %m");
2901                                 goto child_fail;
2902                         }
2903
2904                         if (setsid() < 0) {
2905                                 log_error("setsid() failed: %m");
2906                                 goto child_fail;
2907                         }
2908
2909                         if (reset_audit_loginuid() < 0)
2910                                 goto child_fail;
2911
2912                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2913                                 log_error("PR_SET_PDEATHSIG failed: %m");
2914                                 goto child_fail;
2915                         }
2916
2917                         /* Mark everything as slave, so that we still
2918                          * receive mounts from the real root, but don't
2919                          * propagate mounts to the real root. */
2920                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2921                                 log_error("MS_SLAVE|MS_REC failed: %m");
2922                                 goto child_fail;
2923                         }
2924
2925                         if (mount_devices(arg_directory,
2926                                           root_device, root_device_rw,
2927                                           home_device, home_device_rw,
2928                                           srv_device, srv_device_rw) < 0)
2929                                 goto child_fail;
2930
2931                         /* Turn directory into bind mount */
2932                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2933                                 log_error("Failed to make bind mount.");
2934                                 goto child_fail;
2935                         }
2936
2937                         if (arg_read_only)
2938                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2939                                         log_error("Failed to make read-only.");
2940                                         goto child_fail;
2941                                 }
2942
2943                         if (mount_all(arg_directory) < 0)
2944                                 goto child_fail;
2945
2946                         if (copy_devnodes(arg_directory) < 0)
2947                                 goto child_fail;
2948
2949                         if (setup_ptmx(arg_directory) < 0)
2950                                 goto child_fail;
2951
2952                         dev_setup(arg_directory);
2953
2954                         if (audit_still_doesnt_work_in_containers() < 0)
2955                                 goto child_fail;
2956
2957                         if (setup_dev_console(arg_directory, console) < 0)
2958                                 goto child_fail;
2959
2960                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2961                                 goto child_fail;
2962
2963                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
2964
2965                         if (setup_boot_id(arg_directory) < 0)
2966                                 goto child_fail;
2967
2968                         if (setup_timezone(arg_directory) < 0)
2969                                 goto child_fail;
2970
2971                         if (setup_resolv_conf(arg_directory) < 0)
2972                                 goto child_fail;
2973
2974                         if (setup_journal(arg_directory) < 0)
2975                                 goto child_fail;
2976
2977                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
2978                                 goto child_fail;
2979
2980                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2981                                 goto child_fail;
2982
2983                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
2984                                 goto child_fail;
2985
2986                         /* Tell the parent that we are ready, and that
2987                          * it can cgroupify us to that we lack access
2988                          * to certain devices and resources. */
2989                         eventfd_write(child_ready_fd, 1);
2990                         child_ready_fd = safe_close(child_ready_fd);
2991
2992                         if (chdir(arg_directory) < 0) {
2993                                 log_error("chdir(%s) failed: %m", arg_directory);
2994                                 goto child_fail;
2995                         }
2996
2997                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2998                                 log_error("mount(MS_MOVE) failed: %m");
2999                                 goto child_fail;
3000                         }
3001
3002                         if (chroot(".") < 0) {
3003                                 log_error("chroot() failed: %m");
3004                                 goto child_fail;
3005                         }
3006
3007                         if (chdir("/") < 0) {
3008                                 log_error("chdir() failed: %m");
3009                                 goto child_fail;
3010                         }
3011
3012                         umask(0022);
3013
3014                         if (arg_private_network)
3015                                 loopback_setup();
3016
3017                         if (drop_capabilities() < 0) {
3018                                 log_error("drop_capabilities() failed: %m");
3019                                 goto child_fail;
3020                         }
3021
3022                         r = change_uid_gid(&home);
3023                         if (r < 0)
3024                                 goto child_fail;
3025
3026                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3027                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3028                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3029                                 log_oom();
3030                                 goto child_fail;
3031                         }
3032
3033                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3034                                 char as_uuid[37];
3035
3036                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3037                                         log_oom();
3038                                         goto child_fail;
3039                                 }
3040                         }
3041
3042                         if (fdset_size(fds) > 0) {
3043                                 k = fdset_cloexec(fds, false);
3044                                 if (k < 0) {
3045                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3046                                         goto child_fail;
3047                                 }
3048
3049                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3050                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3051                                         log_oom();
3052                                         goto child_fail;
3053                                 }
3054                         }
3055
3056                         setup_hostname();
3057
3058                         if (arg_personality != 0xffffffffLU) {
3059                                 if (personality(arg_personality) < 0) {
3060                                         log_error("personality() failed: %m");
3061                                         goto child_fail;
3062                                 }
3063                         } else if (secondary) {
3064                                 if (personality(PER_LINUX32) < 0) {
3065                                         log_error("personality() failed: %m");
3066                                         goto child_fail;
3067                                 }
3068                         }
3069
3070 #ifdef HAVE_SELINUX
3071                         if (arg_selinux_context)
3072                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3073                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3074                                         goto child_fail;
3075                                 }
3076 #endif
3077
3078                         if (!strv_isempty(arg_setenv)) {
3079                                 char **n;
3080
3081                                 n = strv_env_merge(2, envp, arg_setenv);
3082                                 if (!n) {
3083                                         log_oom();
3084                                         goto child_fail;
3085                                 }
3086
3087                                 env_use = n;
3088                         } else
3089                                 env_use = (char**) envp;
3090
3091                         /* Wait until the parent is ready with the setup, too... */
3092                         eventfd_read(parent_ready_fd, &x);
3093                         parent_ready_fd = safe_close(parent_ready_fd);
3094
3095                         if (arg_boot) {
3096                                 char **a;
3097                                 size_t l;
3098
3099                                 /* Automatically search for the init system */
3100
3101                                 l = 1 + argc - optind;
3102                                 a = newa(char*, l + 1);
3103                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3104
3105                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3106                                 execve(a[0], a, env_use);
3107
3108                                 a[0] = (char*) "/lib/systemd/systemd";
3109                                 execve(a[0], a, env_use);
3110
3111                                 a[0] = (char*) "/sbin/init";
3112                                 execve(a[0], a, env_use);
3113                         } else if (argc > optind)
3114                                 execvpe(argv[optind], argv + optind, env_use);
3115                         else {
3116                                 chdir(home ? home : "/root");
3117                                 execle("/bin/bash", "-bash", NULL, env_use);
3118                                 execle("/bin/sh", "-sh", NULL, env_use);
3119                         }
3120
3121                         log_error("execv() failed: %m");
3122
3123                 child_fail:
3124                         _exit(EXIT_FAILURE);
3125                 }
3126
3127                 fdset_free(fds);
3128                 fds = NULL;
3129
3130                 /* Wait until the child reported that it is ready with
3131                  * all it needs to do with privileges. After we got
3132                  * the notification we can make the process join its
3133                  * cgroup which might limit what it can do */
3134                 eventfd_read(child_ready_fd, &x);
3135
3136                 r = register_machine(pid);
3137                 if (r < 0)
3138                         goto finish;
3139
3140                 r = move_network_interfaces(pid);
3141                 if (r < 0)
3142                         goto finish;
3143
3144                 r = setup_veth(pid, veth_name);
3145                 if (r < 0)
3146                         goto finish;
3147
3148                 r = setup_bridge(veth_name);
3149                 if (r < 0)
3150                         goto finish;
3151
3152                 r = setup_macvlan(pid);
3153                 if (r < 0)
3154                         goto finish;
3155
3156                 /* Notify the child that the parent is ready with all
3157                  * its setup, and thtat the child can now hand over
3158                  * control to the code to run inside the container. */
3159                 eventfd_write(parent_ready_fd, 1);
3160
3161                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3162                 if (k < 0) {
3163                         r = EXIT_FAILURE;
3164                         break;
3165                 }
3166
3167                 if (!arg_quiet)
3168                         putc('\n', stdout);
3169
3170                 /* Kill if it is not dead yet anyway */
3171                 terminate_machine(pid);
3172
3173                 /* Redundant, but better safe than sorry */
3174                 kill(pid, SIGKILL);
3175
3176                 r = wait_for_container(pid, &container_status);
3177                 pid = 0;
3178
3179                 if (r < 0) {
3180                         r = EXIT_FAILURE;
3181                         break;
3182                 } else if (container_status == CONTAINER_TERMINATED)
3183                         break;
3184
3185                 /* CONTAINER_REBOOTED, loop again */
3186         }
3187
3188 finish:
3189         loop_remove(loop_nr, &image_fd);
3190
3191         if (pid > 0)
3192                 kill(pid, SIGKILL);
3193
3194         free(arg_directory);
3195         free(arg_machine);
3196         free(arg_user);
3197         strv_free(arg_setenv);
3198         strv_free(arg_network_interfaces);
3199         strv_free(arg_network_macvlan);
3200         strv_free(arg_bind);
3201         strv_free(arg_bind_ro);
3202
3203         return r;
3204 }