chiark / gitweb /
501bccae873e33f95da179220a6135bf67b8bf49
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "eventfd-util.h"
88 #include "blkid-util.h"
89 #include "gpt.h"
90 #include "siphash24.h"
91 #include "copy.h"
92 #include "base-filesystem.h"
93
94 #ifdef HAVE_SECCOMP
95 #include "seccomp-util.h"
96 #endif
97
/* Outcome of a container run.
 * NOTE(review): the consumers of this enum are outside this part of the
 * file; presumably it tells the main loop whether to exit or to start the
 * container again -- confirm against the caller. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
102
/* Journal linking mode, selected with --link-journal=MODE (or -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto (the built-in default) */
        LINK_HOST  = 2,  /* --link-journal=host */
        LINK_GUEST = 3   /* --link-journal=guest, also selected by -j */
} LinkJournal;
109
110 static char *arg_directory = NULL;
111 static char *arg_user = NULL;
112 static sd_id128_t arg_uuid = {};
113 static char *arg_machine = NULL;
114 static const char *arg_selinux_context = NULL;
115 static const char *arg_selinux_apifs_context = NULL;
116 static const char *arg_slice = NULL;
117 static bool arg_private_network = false;
118 static bool arg_read_only = false;
119 static bool arg_boot = false;
120 static LinkJournal arg_link_journal = LINK_AUTO;
121 static uint64_t arg_retain =
122         (1ULL << CAP_CHOWN) |
123         (1ULL << CAP_DAC_OVERRIDE) |
124         (1ULL << CAP_DAC_READ_SEARCH) |
125         (1ULL << CAP_FOWNER) |
126         (1ULL << CAP_FSETID) |
127         (1ULL << CAP_IPC_OWNER) |
128         (1ULL << CAP_KILL) |
129         (1ULL << CAP_LEASE) |
130         (1ULL << CAP_LINUX_IMMUTABLE) |
131         (1ULL << CAP_NET_BIND_SERVICE) |
132         (1ULL << CAP_NET_BROADCAST) |
133         (1ULL << CAP_NET_RAW) |
134         (1ULL << CAP_SETGID) |
135         (1ULL << CAP_SETFCAP) |
136         (1ULL << CAP_SETPCAP) |
137         (1ULL << CAP_SETUID) |
138         (1ULL << CAP_SYS_ADMIN) |
139         (1ULL << CAP_SYS_CHROOT) |
140         (1ULL << CAP_SYS_NICE) |
141         (1ULL << CAP_SYS_PTRACE) |
142         (1ULL << CAP_SYS_TTY_CONFIG) |
143         (1ULL << CAP_SYS_RESOURCE) |
144         (1ULL << CAP_SYS_BOOT) |
145         (1ULL << CAP_AUDIT_WRITE) |
146         (1ULL << CAP_AUDIT_CONTROL) |
147         (1ULL << CAP_MKNOD);
148 static char **arg_bind = NULL;
149 static char **arg_bind_ro = NULL;
150 static char **arg_tmpfs = NULL;
151 static char **arg_setenv = NULL;
152 static bool arg_quiet = false;
153 static bool arg_share_system = false;
154 static bool arg_register = true;
155 static bool arg_keep_unit = false;
156 static char **arg_network_interfaces = NULL;
157 static char **arg_network_macvlan = NULL;
158 static bool arg_network_veth = false;
159 static const char *arg_network_bridge = NULL;
160 static unsigned long arg_personality = 0xffffffffLU;
161 static const char *arg_image = NULL;
162
/* Prints the command line synopsis to stdout.
 *
 * Always returns 0, so parse_argv() can hand the value straight back to its
 * caller.
 *
 * NOTE(review): --personality= is accepted by parse_argv() but has no entry
 * in this text. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "  -i --image=PATH           File system device or image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               /* parse_argv() maps -j to LINK_GUEST, i.e. "guest", not "host". */
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n",
               program_invocation_short_name);

        return 0;
}
215
216 static int parse_argv(int argc, char *argv[]) {
217
218         enum {
219                 ARG_VERSION = 0x100,
220                 ARG_PRIVATE_NETWORK,
221                 ARG_UUID,
222                 ARG_READ_ONLY,
223                 ARG_CAPABILITY,
224                 ARG_DROP_CAPABILITY,
225                 ARG_LINK_JOURNAL,
226                 ARG_BIND,
227                 ARG_BIND_RO,
228                 ARG_TMPFS,
229                 ARG_SETENV,
230                 ARG_SHARE_SYSTEM,
231                 ARG_REGISTER,
232                 ARG_KEEP_UNIT,
233                 ARG_NETWORK_INTERFACE,
234                 ARG_NETWORK_MACVLAN,
235                 ARG_NETWORK_VETH,
236                 ARG_NETWORK_BRIDGE,
237                 ARG_PERSONALITY,
238         };
239
240         static const struct option options[] = {
241                 { "help",                  no_argument,       NULL, 'h'                   },
242                 { "version",               no_argument,       NULL, ARG_VERSION           },
243                 { "directory",             required_argument, NULL, 'D'                   },
244                 { "user",                  required_argument, NULL, 'u'                   },
245                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
246                 { "boot",                  no_argument,       NULL, 'b'                   },
247                 { "uuid",                  required_argument, NULL, ARG_UUID              },
248                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
249                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
250                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
251                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
252                 { "bind",                  required_argument, NULL, ARG_BIND              },
253                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
254                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
255                 { "machine",               required_argument, NULL, 'M'                   },
256                 { "slice",                 required_argument, NULL, 'S'                   },
257                 { "setenv",                required_argument, NULL, ARG_SETENV            },
258                 { "selinux-context",       required_argument, NULL, 'Z'                   },
259                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
260                 { "quiet",                 no_argument,       NULL, 'q'                   },
261                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
262                 { "register",              required_argument, NULL, ARG_REGISTER          },
263                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
264                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
265                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
266                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
267                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
268                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
269                 { "image",                 required_argument, NULL, 'i'                   },
270                 {}
271         };
272
273         int c, r;
274         uint64_t plus = 0, minus = 0;
275
276         assert(argc >= 0);
277         assert(argv);
278
279         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
280
281                 switch (c) {
282
283                 case 'h':
284                         return help();
285
286                 case ARG_VERSION:
287                         puts(PACKAGE_STRING);
288                         puts(SYSTEMD_FEATURES);
289                         return 0;
290
291                 case 'D':
292                         free(arg_directory);
293                         arg_directory = canonicalize_file_name(optarg);
294                         if (!arg_directory) {
295                                 log_error("Invalid root directory: %m");
296                                 return -ENOMEM;
297                         }
298
299                         break;
300
301                 case 'i':
302                         arg_image = optarg;
303                         break;
304
305                 case 'u':
306                         free(arg_user);
307                         arg_user = strdup(optarg);
308                         if (!arg_user)
309                                 return log_oom();
310
311                         break;
312
313                 case ARG_NETWORK_BRIDGE:
314                         arg_network_bridge = optarg;
315
316                         /* fall through */
317
318                 case ARG_NETWORK_VETH:
319                         arg_network_veth = true;
320                         arg_private_network = true;
321                         break;
322
323                 case ARG_NETWORK_INTERFACE:
324                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
325                                 return log_oom();
326
327                         arg_private_network = true;
328                         break;
329
330                 case ARG_NETWORK_MACVLAN:
331                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
332                                 return log_oom();
333
334                         /* fall through */
335
336                 case ARG_PRIVATE_NETWORK:
337                         arg_private_network = true;
338                         break;
339
340                 case 'b':
341                         arg_boot = true;
342                         break;
343
344                 case ARG_UUID:
345                         r = sd_id128_from_string(optarg, &arg_uuid);
346                         if (r < 0) {
347                                 log_error("Invalid UUID: %s", optarg);
348                                 return r;
349                         }
350                         break;
351
352                 case 'S':
353                         arg_slice = optarg;
354                         break;
355
356                 case 'M':
357                         if (isempty(optarg)) {
358                                 free(arg_machine);
359                                 arg_machine = NULL;
360                         } else {
361
362                                 if (!hostname_is_valid(optarg)) {
363                                         log_error("Invalid machine name: %s", optarg);
364                                         return -EINVAL;
365                                 }
366
367                                 free(arg_machine);
368                                 arg_machine = strdup(optarg);
369                                 if (!arg_machine)
370                                         return log_oom();
371
372                                 break;
373                         }
374
375                 case 'Z':
376                         arg_selinux_context = optarg;
377                         break;
378
379                 case 'L':
380                         arg_selinux_apifs_context = optarg;
381                         break;
382
383                 case ARG_READ_ONLY:
384                         arg_read_only = true;
385                         break;
386
387                 case ARG_CAPABILITY:
388                 case ARG_DROP_CAPABILITY: {
389                         char *state, *word;
390                         size_t length;
391
392                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
393                                 _cleanup_free_ char *t;
394                                 cap_value_t cap;
395
396                                 t = strndup(word, length);
397                                 if (!t)
398                                         return log_oom();
399
400                                 if (streq(t, "all")) {
401                                         if (c == ARG_CAPABILITY)
402                                                 plus = (uint64_t) -1;
403                                         else
404                                                 minus = (uint64_t) -1;
405                                 } else {
406                                         if (cap_from_name(t, &cap) < 0) {
407                                                 log_error("Failed to parse capability %s.", t);
408                                                 return -EINVAL;
409                                         }
410
411                                         if (c == ARG_CAPABILITY)
412                                                 plus |= 1ULL << (uint64_t) cap;
413                                         else
414                                                 minus |= 1ULL << (uint64_t) cap;
415                                 }
416                         }
417
418                         break;
419                 }
420
421                 case 'j':
422                         arg_link_journal = LINK_GUEST;
423                         break;
424
425                 case ARG_LINK_JOURNAL:
426                         if (streq(optarg, "auto"))
427                                 arg_link_journal = LINK_AUTO;
428                         else if (streq(optarg, "no"))
429                                 arg_link_journal = LINK_NO;
430                         else if (streq(optarg, "guest"))
431                                 arg_link_journal = LINK_GUEST;
432                         else if (streq(optarg, "host"))
433                                 arg_link_journal = LINK_HOST;
434                         else {
435                                 log_error("Failed to parse link journal mode %s", optarg);
436                                 return -EINVAL;
437                         }
438
439                         break;
440
441                 case ARG_BIND:
442                 case ARG_BIND_RO: {
443                         _cleanup_free_ char *a = NULL, *b = NULL;
444                         char *e;
445                         char ***x;
446
447                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
448
449                         e = strchr(optarg, ':');
450                         if (e) {
451                                 a = strndup(optarg, e - optarg);
452                                 b = strdup(e + 1);
453                         } else {
454                                 a = strdup(optarg);
455                                 b = strdup(optarg);
456                         }
457
458                         if (!a || !b)
459                                 return log_oom();
460
461                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
462                                 log_error("Invalid bind mount specification: %s", optarg);
463                                 return -EINVAL;
464                         }
465
466                         r = strv_extend(x, a);
467                         if (r < 0)
468                                 return log_oom();
469
470                         r = strv_extend(x, b);
471                         if (r < 0)
472                                 return log_oom();
473
474                         break;
475                 }
476
477                 case ARG_TMPFS: {
478                         _cleanup_free_ char *a = NULL, *b = NULL;
479                         char *e;
480
481                         e = strchr(optarg, ':');
482                         if (e) {
483                                 a = strndup(optarg, e - optarg);
484                                 b = strdup(e + 1);
485                         } else {
486                                 a = strdup(optarg);
487                                 b = strdup("mode=0755");
488                         }
489
490                         if (!a || !b)
491                                 return log_oom();
492
493                         if (!path_is_absolute(a)) {
494                                 log_error("Invalid tmpfs specification: %s", optarg);
495                                 return -EINVAL;
496                         }
497
498                         r = strv_push(&arg_tmpfs, a);
499                         if (r < 0)
500                                 return log_oom();
501
502                         a = NULL;
503
504                         r = strv_push(&arg_tmpfs, b);
505                         if (r < 0)
506                                 return log_oom();
507
508                         b = NULL;
509
510                         break;
511                 }
512
513                 case ARG_SETENV: {
514                         char **n;
515
516                         if (!env_assignment_is_valid(optarg)) {
517                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
518                                 return -EINVAL;
519                         }
520
521                         n = strv_env_set(arg_setenv, optarg);
522                         if (!n)
523                                 return log_oom();
524
525                         strv_free(arg_setenv);
526                         arg_setenv = n;
527                         break;
528                 }
529
530                 case 'q':
531                         arg_quiet = true;
532                         break;
533
534                 case ARG_SHARE_SYSTEM:
535                         arg_share_system = true;
536                         break;
537
538                 case ARG_REGISTER:
539                         r = parse_boolean(optarg);
540                         if (r < 0) {
541                                 log_error("Failed to parse --register= argument: %s", optarg);
542                                 return r;
543                         }
544
545                         arg_register = r;
546                         break;
547
548                 case ARG_KEEP_UNIT:
549                         arg_keep_unit = true;
550                         break;
551
552                 case ARG_PERSONALITY:
553
554                         arg_personality = personality_from_string(optarg);
555                         if (arg_personality == 0xffffffffLU) {
556                                 log_error("Unknown or unsupported personality '%s'.", optarg);
557                                 return -EINVAL;
558                         }
559
560                         break;
561
562                 case '?':
563                         return -EINVAL;
564
565                 default:
566                         assert_not_reached("Unhandled option");
567                 }
568         }
569
570         if (arg_share_system)
571                 arg_register = false;
572
573         if (arg_boot && arg_share_system) {
574                 log_error("--boot and --share-system may not be combined.");
575                 return -EINVAL;
576         }
577
578         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
579                 log_error("--keep-unit may not be used when invoked from a user session.");
580                 return -EINVAL;
581         }
582
583         if (arg_directory && arg_image) {
584                 log_error("--directory= and --image= may not be combined.");
585                 return -EINVAL;
586         }
587
588         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
589
590         return 1;
591 }
592
593 static int mount_all(const char *dest) {
594
595         typedef struct MountPoint {
596                 const char *what;
597                 const char *where;
598                 const char *type;
599                 const char *options;
600                 unsigned long flags;
601                 bool fatal;
602         } MountPoint;
603
604         static const MountPoint mount_table[] = {
605                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
606                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
607                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
608                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
609                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
610                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
611                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
612                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
613 #ifdef HAVE_SELINUX
614                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
615                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
616 #endif
617         };
618
619         unsigned k;
620         int r = 0;
621
622         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
623                 _cleanup_free_ char *where = NULL;
624 #ifdef HAVE_SELINUX
625                 _cleanup_free_ char *options = NULL;
626 #endif
627                 const char *o;
628                 int t;
629
630                 where = strjoin(dest, "/", mount_table[k].where, NULL);
631                 if (!where)
632                         return log_oom();
633
634                 t = path_is_mount_point(where, true);
635                 if (t < 0) {
636                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
637
638                         if (r == 0)
639                                 r = t;
640
641                         continue;
642                 }
643
644                 /* Skip this entry if it is not a remount. */
645                 if (mount_table[k].what && t > 0)
646                         continue;
647
648                 mkdir_p(where, 0755);
649
650 #ifdef HAVE_SELINUX
651                 if (arg_selinux_apifs_context &&
652                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
653                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
654                         if (!options)
655                                 return log_oom();
656
657                         o = options;
658                 } else
659 #endif
660                         o = mount_table[k].options;
661
662
663                 if (mount(mount_table[k].what,
664                           where,
665                           mount_table[k].type,
666                           mount_table[k].flags,
667                           o) < 0 &&
668                     mount_table[k].fatal) {
669
670                         log_error("mount(%s) failed: %m", where);
671
672                         if (r == 0)
673                                 r = -errno;
674                 }
675         }
676
677         return r;
678 }
679
680 static int mount_binds(const char *dest, char **l, bool ro) {
681         char **x, **y;
682
683         STRV_FOREACH_PAIR(x, y, l) {
684                 _cleanup_free_ char *where = NULL;
685                 struct stat source_st, dest_st;
686                 int r;
687
688                 if (stat(*x, &source_st) < 0) {
689                         log_error("Failed to stat %s: %m", *x);
690                         return -errno;
691                 }
692
693                 where = strappend(dest, *y);
694                 if (!where)
695                         return log_oom();
696
697                 r = stat(where, &dest_st);
698                 if (r == 0) {
699                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
700                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
701                                 return -EINVAL;
702                         }
703                 } else if (errno == ENOENT) {
704                         r = mkdir_parents_label(where, 0755);
705                         if (r < 0) {
706                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
707                                 return r;
708                         }
709                 } else {
710                         log_error("Failed to bind mount %s: %m", *x);
711                         return -errno;
712                 }
713
714                 /* Create the mount point, but be conservative -- refuse to create block
715                 * and char devices. */
716                 if (S_ISDIR(source_st.st_mode))
717                         mkdir_label(where, 0755);
718                 else if (S_ISFIFO(source_st.st_mode))
719                         mkfifo(where, 0644);
720                 else if (S_ISSOCK(source_st.st_mode))
721                         mknod(where, 0644 | S_IFSOCK, 0);
722                 else if (S_ISREG(source_st.st_mode))
723                         touch(where);
724                 else {
725                         log_error("Refusing to create mountpoint for file: %s", *x);
726                         return -ENOTSUP;
727                 }
728
729                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
730                         log_error("mount(%s) failed: %m", where);
731                         return -errno;
732                 }
733
734                 if (ro) {
735                         r = bind_remount_recursive(where, true);
736                         if (r < 0) {
737                                 log_error("Read-Only bind mount failed: %s", strerror(-r));
738                                 return r;
739                         }
740                 }
741         }
742
743         return 0;
744 }
745
746 static int mount_tmpfs(const char *dest) {
747         char **i, **o;
748
749         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
750                 _cleanup_free_ char *where = NULL;
751
752                 where = strappend(dest, *i);
753                 if (!where)
754                         return log_oom();
755
756                 mkdir_label(where, 0755);
757
758                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
759                         log_error("tmpfs mount to %s failed: %m", where);
760                         return -errno;
761                 }
762         }
763
764         return 0;
765 }
766
767 static int setup_timezone(const char *dest) {
768         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
769         char *z, *y;
770         int r;
771
772         assert(dest);
773
774         /* Fix the timezone, if possible */
775         r = readlink_malloc("/etc/localtime", &p);
776         if (r < 0) {
777                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
778                 return 0;
779         }
780
781         z = path_startswith(p, "../usr/share/zoneinfo/");
782         if (!z)
783                 z = path_startswith(p, "/usr/share/zoneinfo/");
784         if (!z) {
785                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
786                 return 0;
787         }
788
789         where = strappend(dest, "/etc/localtime");
790         if (!where)
791                 return log_oom();
792
793         r = readlink_malloc(where, &q);
794         if (r >= 0) {
795                 y = path_startswith(q, "../usr/share/zoneinfo/");
796                 if (!y)
797                         y = path_startswith(q, "/usr/share/zoneinfo/");
798
799
800                 /* Already pointing to the right place? Then do nothing .. */
801                 if (y && streq(y, z))
802                         return 0;
803         }
804
805         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
806         if (!check)
807                 return log_oom();
808
809         if (access(check, F_OK) < 0) {
810                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
811                 return 0;
812         }
813
814         what = strappend("../usr/share/zoneinfo/", z);
815         if (!what)
816                 return log_oom();
817
818         unlink(where);
819         if (symlink(what, where) < 0) {
820                 log_error("Failed to correct timezone of container: %m");
821                 return 0;
822         }
823
824         return 0;
825 }
826
827 static int setup_resolv_conf(const char *dest) {
828         _cleanup_free_ char *where = NULL;
829
830         assert(dest);
831
832         if (arg_private_network)
833                 return 0;
834
835         /* Fix resolv.conf, if possible */
836         where = strappend(dest, "/etc/resolv.conf");
837         if (!where)
838                 return log_oom();
839
840         /* We don't really care for the results of this really. If it
841          * fails, it fails, but meh... */
842         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
843
844         return 0;
845 }
846
847 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
848
849         snprintf(s, 37,
850                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
851                  SD_ID128_FORMAT_VAL(id));
852
853         return s;
854 }
855
856 static int setup_boot_id(const char *dest) {
857         _cleanup_free_ char *from = NULL, *to = NULL;
858         sd_id128_t rnd = {};
859         char as_uuid[37];
860         int r;
861
862         assert(dest);
863
864         if (arg_share_system)
865                 return 0;
866
867         /* Generate a new randomized boot ID, so that each boot-up of
868          * the container gets a new one */
869
870         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
871         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
872         if (!from || !to)
873                 return log_oom();
874
875         r = sd_id128_randomize(&rnd);
876         if (r < 0) {
877                 log_error("Failed to generate random boot id: %s", strerror(-r));
878                 return r;
879         }
880
881         id128_format_as_uuid(rnd, as_uuid);
882
883         r = write_string_file(from, as_uuid);
884         if (r < 0) {
885                 log_error("Failed to write boot id: %s", strerror(-r));
886                 return r;
887         }
888
889         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
890                 log_error("Failed to bind mount boot id: %m");
891                 r = -errno;
892         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
893                 log_warning("Failed to make boot id read-only: %m");
894
895         unlink(from);
896         return r;
897 }
898
899 static int copy_devnodes(const char *dest) {
900
901         static const char devnodes[] =
902                 "null\0"
903                 "zero\0"
904                 "full\0"
905                 "random\0"
906                 "urandom\0"
907                 "tty\0";
908
909         const char *d;
910         int r = 0;
911         _cleanup_umask_ mode_t u;
912
913         assert(dest);
914
915         u = umask(0000);
916
917         NULSTR_FOREACH(d, devnodes) {
918                 _cleanup_free_ char *from = NULL, *to = NULL;
919                 struct stat st;
920
921                 from = strappend("/dev/", d);
922                 to = strjoin(dest, "/dev/", d, NULL);
923                 if (!from || !to)
924                         return log_oom();
925
926                 if (stat(from, &st) < 0) {
927
928                         if (errno != ENOENT) {
929                                 log_error("Failed to stat %s: %m", from);
930                                 return -errno;
931                         }
932
933                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
934
935                         log_error("%s is not a char or block device, cannot copy", from);
936                         return -EIO;
937
938                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
939
940                         log_error("mknod(%s) failed: %m", dest);
941                         return  -errno;
942                 }
943         }
944
945         return r;
946 }
947
948 static int setup_ptmx(const char *dest) {
949         _cleanup_free_ char *p = NULL;
950
951         p = strappend(dest, "/dev/ptmx");
952         if (!p)
953                 return log_oom();
954
955         if (symlink("pts/ptmx", p) < 0) {
956                 log_error("Failed to create /dev/ptmx symlink: %m");
957                 return -errno;
958         }
959
960         return 0;
961 }
962
963 static int setup_dev_console(const char *dest, const char *console) {
964         _cleanup_umask_ mode_t u;
965         const char *to;
966         struct stat st;
967         int r;
968
969         assert(dest);
970         assert(console);
971
972         u = umask(0000);
973
974         if (stat("/dev/null", &st) < 0) {
975                 log_error("Failed to stat /dev/null: %m");
976                 return -errno;
977         }
978
979         r = chmod_and_chown(console, 0600, 0, 0);
980         if (r < 0) {
981                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
982                 return r;
983         }
984
985         /* We need to bind mount the right tty to /dev/console since
986          * ptys can only exist on pts file systems. To have something
987          * to bind mount things on we create a device node first, and
988          * use /dev/null for that since we the cgroups device policy
989          * allows us to create that freely, while we cannot create
990          * /dev/console. (Note that the major minor doesn't actually
991          * matter here, since we mount it over anyway). */
992
993         to = strappenda(dest, "/dev/console");
994         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
995                 log_error("mknod() for /dev/console failed: %m");
996                 return -errno;
997         }
998
999         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1000                 log_error("Bind mount for /dev/console failed: %m");
1001                 return -errno;
1002         }
1003
1004         return 0;
1005 }
1006
1007 static int setup_kmsg(const char *dest, int kmsg_socket) {
1008         _cleanup_free_ char *from = NULL, *to = NULL;
1009         int r, fd, k;
1010         _cleanup_umask_ mode_t u;
1011         union {
1012                 struct cmsghdr cmsghdr;
1013                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1014         } control = {};
1015         struct msghdr mh = {
1016                 .msg_control = &control,
1017                 .msg_controllen = sizeof(control),
1018         };
1019         struct cmsghdr *cmsg;
1020
1021         assert(dest);
1022         assert(kmsg_socket >= 0);
1023
1024         u = umask(0000);
1025
1026         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1027          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1028          * on the reading side behave very similar to /proc/kmsg,
1029          * their writing side behaves differently from /dev/kmsg in
1030          * that writing blocks when nothing is reading. In order to
1031          * avoid any problems with containers deadlocking due to this
1032          * we simply make /dev/kmsg unavailable to the container. */
1033         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1034             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1035                 return log_oom();
1036
1037         if (mkfifo(from, 0600) < 0) {
1038                 log_error("mkfifo() for /dev/kmsg failed: %m");
1039                 return -errno;
1040         }
1041
1042         r = chmod_and_chown(from, 0600, 0, 0);
1043         if (r < 0) {
1044                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1045                 return r;
1046         }
1047
1048         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1049                 log_error("Bind mount for /proc/kmsg failed: %m");
1050                 return -errno;
1051         }
1052
1053         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1054         if (fd < 0) {
1055                 log_error("Failed to open fifo: %m");
1056                 return -errno;
1057         }
1058
1059         cmsg = CMSG_FIRSTHDR(&mh);
1060         cmsg->cmsg_level = SOL_SOCKET;
1061         cmsg->cmsg_type = SCM_RIGHTS;
1062         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1063         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1064
1065         mh.msg_controllen = cmsg->cmsg_len;
1066
1067         /* Store away the fd in the socket, so that it stays open as
1068          * long as we run the child */
1069         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1070         safe_close(fd);
1071
1072         if (k < 0) {
1073                 log_error("Failed to send FIFO fd: %m");
1074                 return -errno;
1075         }
1076
1077         /* And now make the FIFO unavailable as /dev/kmsg... */
1078         unlink(from);
1079         return 0;
1080 }
1081
1082 static int setup_hostname(void) {
1083
1084         if (arg_share_system)
1085                 return 0;
1086
1087         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1088                 return -errno;
1089
1090         return 0;
1091 }
1092
1093 static int setup_journal(const char *directory) {
1094         sd_id128_t machine_id, this_id;
1095         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1096         char *id;
1097         int r;
1098
1099         p = strappend(directory, "/etc/machine-id");
1100         if (!p)
1101                 return log_oom();
1102
1103         r = read_one_line_file(p, &b);
1104         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1105                 return 0;
1106         else if (r < 0) {
1107                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1108                 return r;
1109         }
1110
1111         id = strstrip(b);
1112         if (isempty(id) && arg_link_journal == LINK_AUTO)
1113                 return 0;
1114
1115         /* Verify validity */
1116         r = sd_id128_from_string(id, &machine_id);
1117         if (r < 0) {
1118                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1119                 return r;
1120         }
1121
1122         r = sd_id128_get_machine(&this_id);
1123         if (r < 0) {
1124                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1125                 return r;
1126         }
1127
1128         if (sd_id128_equal(machine_id, this_id)) {
1129                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1130                          "Host and machine ids are equal (%s): refusing to link journals", id);
1131                 if (arg_link_journal == LINK_AUTO)
1132                         return 0;
1133                 return
1134                         -EEXIST;
1135         }
1136
1137         if (arg_link_journal == LINK_NO)
1138                 return 0;
1139
1140         free(p);
1141         p = strappend("/var/log/journal/", id);
1142         q = strjoin(directory, "/var/log/journal/", id, NULL);
1143         if (!p || !q)
1144                 return log_oom();
1145
1146         if (path_is_mount_point(p, false) > 0) {
1147                 if (arg_link_journal != LINK_AUTO) {
1148                         log_error("%s: already a mount point, refusing to use for journal", p);
1149                         return -EEXIST;
1150                 }
1151
1152                 return 0;
1153         }
1154
1155         if (path_is_mount_point(q, false) > 0) {
1156                 if (arg_link_journal != LINK_AUTO) {
1157                         log_error("%s: already a mount point, refusing to use for journal", q);
1158                         return -EEXIST;
1159                 }
1160
1161                 return 0;
1162         }
1163
1164         r = readlink_and_make_absolute(p, &d);
1165         if (r >= 0) {
1166                 if ((arg_link_journal == LINK_GUEST ||
1167                      arg_link_journal == LINK_AUTO) &&
1168                     path_equal(d, q)) {
1169
1170                         r = mkdir_p(q, 0755);
1171                         if (r < 0)
1172                                 log_warning("failed to create directory %s: %m", q);
1173                         return 0;
1174                 }
1175
1176                 if (unlink(p) < 0) {
1177                         log_error("Failed to remove symlink %s: %m", p);
1178                         return -errno;
1179                 }
1180         } else if (r == -EINVAL) {
1181
1182                 if (arg_link_journal == LINK_GUEST &&
1183                     rmdir(p) < 0) {
1184
1185                         if (errno == ENOTDIR) {
1186                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1187                                 return r;
1188                         } else {
1189                                 log_error("Failed to remove %s: %m", p);
1190                                 return -errno;
1191                         }
1192                 }
1193         } else if (r != -ENOENT) {
1194                 log_error("readlink(%s) failed: %m", p);
1195                 return r;
1196         }
1197
1198         if (arg_link_journal == LINK_GUEST) {
1199
1200                 if (symlink(q, p) < 0) {
1201                         log_error("Failed to symlink %s to %s: %m", q, p);
1202                         return -errno;
1203                 }
1204
1205                 r = mkdir_p(q, 0755);
1206                 if (r < 0)
1207                         log_warning("failed to create directory %s: %m", q);
1208                 return 0;
1209         }
1210
1211         if (arg_link_journal == LINK_HOST) {
1212                 r = mkdir_p(p, 0755);
1213                 if (r < 0) {
1214                         log_error("Failed to create %s: %m", p);
1215                         return r;
1216                 }
1217
1218         } else if (access(p, F_OK) < 0)
1219                 return 0;
1220
1221         if (dir_is_empty(q) == 0)
1222                 log_warning("%s is not empty, proceeding anyway.", q);
1223
1224         r = mkdir_p(q, 0755);
1225         if (r < 0) {
1226                 log_error("Failed to create %s: %m", q);
1227                 return r;
1228         }
1229
1230         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1231                 log_error("Failed to bind mount journal from host into guest: %m");
1232                 return -errno;
1233         }
1234
1235         return 0;
1236 }
1237
/* Creates the directory "dest"/dev/kdbus and bind mounts "path" onto
 * it. A NULL "path" turns the call into a no-op. Returns 0 on success
 * or -errno if mkdir() or mount() fails. */
static int setup_kdbus(const char *dest, const char *path) {
        const char *mount_point;

        if (path == NULL)
                return 0;

        mount_point = strappenda(dest, "/dev/kdbus");

        if (mkdir(mount_point, 0755) < 0) {
                log_error("Failed to create kdbus path: %m");
                return -errno;
        }

        if (mount(path, mount_point, "bind", MS_BIND, NULL) < 0) {
                log_error("Failed to mount kdbus domain path: %m");
                return -errno;
        }

        return 0;
}
1257
1258 static int drop_capabilities(void) {
1259         return capability_bounding_set_drop(~arg_retain, false);
1260 }
1261
1262 static int register_machine(pid_t pid) {
1263         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1264         _cleanup_bus_unref_ sd_bus *bus = NULL;
1265         int r;
1266
1267         if (!arg_register)
1268                 return 0;
1269
1270         r = sd_bus_default_system(&bus);
1271         if (r < 0) {
1272                 log_error("Failed to open system bus: %s", strerror(-r));
1273                 return r;
1274         }
1275
1276         if (arg_keep_unit) {
1277                 r = sd_bus_call_method(
1278                                 bus,
1279                                 "org.freedesktop.machine1",
1280                                 "/org/freedesktop/machine1",
1281                                 "org.freedesktop.machine1.Manager",
1282                                 "RegisterMachine",
1283                                 &error,
1284                                 NULL,
1285                                 "sayssus",
1286                                 arg_machine,
1287                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1288                                 "nspawn",
1289                                 "container",
1290                                 (uint32_t) pid,
1291                                 strempty(arg_directory));
1292         } else {
1293                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1294
1295                 r = sd_bus_message_new_method_call(
1296                                 bus,
1297                                 &m,
1298                                 "org.freedesktop.machine1",
1299                                 "/org/freedesktop/machine1",
1300                                 "org.freedesktop.machine1.Manager",
1301                                 "CreateMachine");
1302                 if (r < 0) {
1303                         log_error("Failed to create message: %s", strerror(-r));
1304                         return r;
1305                 }
1306
1307                 r = sd_bus_message_append(
1308                                 m,
1309                                 "sayssus",
1310                                 arg_machine,
1311                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1312                                 "nspawn",
1313                                 "container",
1314                                 (uint32_t) pid,
1315                                 strempty(arg_directory));
1316                 if (r < 0) {
1317                         log_error("Failed to append message arguments: %s", strerror(-r));
1318                         return r;
1319                 }
1320
1321                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1322                 if (r < 0) {
1323                         log_error("Failed to open container: %s", strerror(-r));
1324                         return r;
1325                 }
1326
1327                 if (!isempty(arg_slice)) {
1328                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1329                         if (r < 0) {
1330                                 log_error("Failed to append slice: %s", strerror(-r));
1331                                 return r;
1332                         }
1333                 }
1334
1335                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1336                 if (r < 0) {
1337                         log_error("Failed to add device policy: %s", strerror(-r));
1338                         return r;
1339                 }
1340
1341                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1342                                           /* Allow the container to
1343                                            * access and create the API
1344                                            * device nodes, so that
1345                                            * PrivateDevices= in the
1346                                            * container can work
1347                                            * fine */
1348                                           "/dev/null", "rwm",
1349                                           "/dev/zero", "rwm",
1350                                           "/dev/full", "rwm",
1351                                           "/dev/random", "rwm",
1352                                           "/dev/urandom", "rwm",
1353                                           "/dev/tty", "rwm",
1354                                           /* Allow the container
1355                                            * access to ptys. However,
1356                                            * do not permit the
1357                                            * container to ever create
1358                                            * these device nodes. */
1359                                           "/dev/pts/ptmx", "rw",
1360                                           "char-pts", "rw",
1361                                           /* Allow the container
1362                                            * access to all kdbus
1363                                            * devices. Again, the
1364                                            * container cannot create
1365                                            * these nodes, only use
1366                                            * them. We use a pretty
1367                                            * open match here, so that
1368                                            * the kernel API can still
1369                                            * change. */
1370                                           "char-kdbus", "rw",
1371                                           "char-kdbus/*", "rw");
1372                 if (r < 0) {
1373                         log_error("Failed to add device whitelist: %s", strerror(-r));
1374                         return r;
1375                 }
1376
1377                 r = sd_bus_message_close_container(m);
1378                 if (r < 0) {
1379                         log_error("Failed to close container: %s", strerror(-r));
1380                         return r;
1381                 }
1382
1383                 r = sd_bus_call(bus, m, 0, &error, NULL);
1384         }
1385
1386         if (r < 0) {
1387                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1388                 return r;
1389         }
1390
1391         return 0;
1392 }
1393
1394 static int terminate_machine(pid_t pid) {
1395         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1396         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1397         _cleanup_bus_unref_ sd_bus *bus = NULL;
1398         const char *path;
1399         int r;
1400
1401         if (!arg_register)
1402                 return 0;
1403
1404         r = sd_bus_default_system(&bus);
1405         if (r < 0) {
1406                 log_error("Failed to open system bus: %s", strerror(-r));
1407                 return r;
1408         }
1409
1410         r = sd_bus_call_method(
1411                         bus,
1412                         "org.freedesktop.machine1",
1413                         "/org/freedesktop/machine1",
1414                         "org.freedesktop.machine1.Manager",
1415                         "GetMachineByPID",
1416                         &error,
1417                         &reply,
1418                         "u",
1419                         (uint32_t) pid);
1420         if (r < 0) {
1421                 /* Note that the machine might already have been
1422                  * cleaned up automatically, hence don't consider it a
1423                  * failure if we cannot get the machine object. */
1424                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1425                 return 0;
1426         }
1427
1428         r = sd_bus_message_read(reply, "o", &path);
1429         if (r < 0)
1430                 return bus_log_parse_error(r);
1431
1432         r = sd_bus_call_method(
1433                         bus,
1434                         "org.freedesktop.machine1",
1435                         path,
1436                         "org.freedesktop.machine1.Machine",
1437                         "Terminate",
1438                         &error,
1439                         NULL,
1440                         NULL);
1441         if (r < 0) {
1442                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1443                 return 0;
1444         }
1445
1446         return 0;
1447 }
1448
1449 static int reset_audit_loginuid(void) {
1450         _cleanup_free_ char *p = NULL;
1451         int r;
1452
1453         if (arg_share_system)
1454                 return 0;
1455
1456         r = read_one_line_file("/proc/self/loginuid", &p);
1457         if (r == -ENOENT)
1458                 return 0;
1459         if (r < 0) {
1460                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1461                 return r;
1462         }
1463
1464         /* Already reset? */
1465         if (streq(p, "4294967295"))
1466                 return 0;
1467
1468         r = write_string_file("/proc/self/loginuid", "4294967295");
1469         if (r < 0) {
1470                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1471                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1472                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1473                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1474                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1475
1476                 sleep(5);
1477         }
1478
1479         return 0;
1480 }
1481
1482 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1483
1484 static int get_mac(struct ether_addr *mac) {
1485         int r;
1486
1487         uint8_t result[8];
1488         size_t l, sz;
1489         uint8_t *v;
1490
1491         l = strlen(arg_machine);
1492         sz = sizeof(sd_id128_t) + l;
1493         v = alloca(sz);
1494
1495         /* fetch some persistent data unique to the host */
1496         r = sd_id128_get_machine((sd_id128_t*) v);
1497         if (r < 0)
1498                 return r;
1499
1500         /* combine with some data unique (on this host) to this
1501          * container instance */
1502         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1503
1504         /* Let's hash the host machine ID plus the container name. We
1505          * use a fixed, but originally randomly created hash key here. */
1506         siphash24(result, v, sz, HASH_KEY.bytes);
1507
1508         assert_cc(ETH_ALEN <= sizeof(result));
1509         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1510
1511         /* see eth_random_addr in the kernel */
1512         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1513         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1514
1515         return 0;
1516 }
1517
1518 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1519         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1520         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1521         struct ether_addr mac;
1522         int r;
1523
1524         if (!arg_private_network)
1525                 return 0;
1526
1527         if (!arg_network_veth)
1528                 return 0;
1529
1530         /* Use two different interface name prefixes depending whether
1531          * we are in bridge mode or not. */
1532         if (arg_network_bridge)
1533                 memcpy(iface_name, "vb-", 3);
1534         else
1535                 memcpy(iface_name, "ve-", 3);
1536         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1537
1538         r = get_mac(&mac);
1539         if (r < 0) {
1540                 log_error("Failed to generate predictable MAC address for host0");
1541                 return r;
1542         }
1543
1544         r = sd_rtnl_open(&rtnl, 0);
1545         if (r < 0) {
1546                 log_error("Failed to connect to netlink: %s", strerror(-r));
1547                 return r;
1548         }
1549
1550         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1551         if (r < 0) {
1552                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1553                 return r;
1554         }
1555
1556         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1557         if (r < 0) {
1558                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1559                 return r;
1560         }
1561
1562         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1563         if (r < 0) {
1564                 log_error("Failed to open netlink container: %s", strerror(-r));
1565                 return r;
1566         }
1567
1568         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1569         if (r < 0) {
1570                 log_error("Failed to open netlink container: %s", strerror(-r));
1571                 return r;
1572         }
1573
1574         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1575         if (r < 0) {
1576                 log_error("Failed to open netlink container: %s", strerror(-r));
1577                 return r;
1578         }
1579
1580         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1581         if (r < 0) {
1582                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1583                 return r;
1584         }
1585
1586         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1587         if (r < 0) {
1588                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1589                 return r;
1590         }
1591
1592         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1593         if (r < 0) {
1594                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1595                 return r;
1596         }
1597
1598         r = sd_rtnl_message_close_container(m);
1599         if (r < 0) {
1600                 log_error("Failed to close netlink container: %s", strerror(-r));
1601                 return r;
1602         }
1603
1604         r = sd_rtnl_message_close_container(m);
1605         if (r < 0) {
1606                 log_error("Failed to close netlink container: %s", strerror(-r));
1607                 return r;
1608         }
1609
1610         r = sd_rtnl_message_close_container(m);
1611         if (r < 0) {
1612                 log_error("Failed to close netlink container: %s", strerror(-r));
1613                 return r;
1614         }
1615
1616         r = sd_rtnl_call(rtnl, m, 0, NULL);
1617         if (r < 0) {
1618                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1619                 return r;
1620         }
1621
1622         return 0;
1623 }
1624
1625 static int setup_bridge(const char veth_name[]) {
1626         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1627         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1628         int r, bridge;
1629
1630         if (!arg_private_network)
1631                 return 0;
1632
1633         if (!arg_network_veth)
1634                 return 0;
1635
1636         if (!arg_network_bridge)
1637                 return 0;
1638
1639         bridge = (int) if_nametoindex(arg_network_bridge);
1640         if (bridge <= 0) {
1641                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1642                 return -errno;
1643         }
1644
1645         r = sd_rtnl_open(&rtnl, 0);
1646         if (r < 0) {
1647                 log_error("Failed to connect to netlink: %s", strerror(-r));
1648                 return r;
1649         }
1650
1651         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1652         if (r < 0) {
1653                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1654                 return r;
1655         }
1656
1657         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1658         if (r < 0) {
1659                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1660                 return r;
1661         }
1662
1663         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1664         if (r < 0) {
1665                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1666                 return r;
1667         }
1668
1669         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1670         if (r < 0) {
1671                 log_error("Failed to add netlink master field: %s", strerror(-r));
1672                 return r;
1673         }
1674
1675         r = sd_rtnl_call(rtnl, m, 0, NULL);
1676         if (r < 0) {
1677                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1678                 return r;
1679         }
1680
1681         return 0;
1682 }
1683
1684 static int parse_interface(struct udev *udev, const char *name) {
1685         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1686         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1687         int ifi;
1688
1689         ifi = (int) if_nametoindex(name);
1690         if (ifi <= 0) {
1691                 log_error("Failed to resolve interface %s: %m", name);
1692                 return -errno;
1693         }
1694
1695         sprintf(ifi_str, "n%i", ifi);
1696         d = udev_device_new_from_device_id(udev, ifi_str);
1697         if (!d) {
1698                 log_error("Failed to get udev device for interface %s: %m", name);
1699                 return -errno;
1700         }
1701
1702         if (udev_device_get_is_initialized(d) <= 0) {
1703                 log_error("Network interface %s is not initialized yet.", name);
1704                 return -EBUSY;
1705         }
1706
1707         return ifi;
1708 }
1709
1710 static int move_network_interfaces(pid_t pid) {
1711         _cleanup_udev_unref_ struct udev *udev = NULL;
1712         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1713         char **i;
1714         int r;
1715
1716         if (!arg_private_network)
1717                 return 0;
1718
1719         if (strv_isempty(arg_network_interfaces))
1720                 return 0;
1721
1722         r = sd_rtnl_open(&rtnl, 0);
1723         if (r < 0) {
1724                 log_error("Failed to connect to netlink: %s", strerror(-r));
1725                 return r;
1726         }
1727
1728         udev = udev_new();
1729         if (!udev) {
1730                 log_error("Failed to connect to udev.");
1731                 return -ENOMEM;
1732         }
1733
1734         STRV_FOREACH(i, arg_network_interfaces) {
1735                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1736                 int ifi;
1737
1738                 ifi = parse_interface(udev, *i);
1739                 if (ifi < 0)
1740                         return ifi;
1741
1742                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1743                 if (r < 0) {
1744                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1745                         return r;
1746                 }
1747
1748                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1749                 if (r < 0) {
1750                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1751                         return r;
1752                 }
1753
1754                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1755                 if (r < 0) {
1756                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1757                         return r;
1758                 }
1759         }
1760
1761         return 0;
1762 }
1763
1764 static int setup_macvlan(pid_t pid) {
1765         _cleanup_udev_unref_ struct udev *udev = NULL;
1766         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1767         char **i;
1768         int r;
1769
1770         if (!arg_private_network)
1771                 return 0;
1772
1773         if (strv_isempty(arg_network_macvlan))
1774                 return 0;
1775
1776         r = sd_rtnl_open(&rtnl, 0);
1777         if (r < 0) {
1778                 log_error("Failed to connect to netlink: %s", strerror(-r));
1779                 return r;
1780         }
1781
1782         udev = udev_new();
1783         if (!udev) {
1784                 log_error("Failed to connect to udev.");
1785                 return -ENOMEM;
1786         }
1787
1788         STRV_FOREACH(i, arg_network_macvlan) {
1789                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1790                 _cleanup_free_ char *n = NULL;
1791                 int ifi;
1792
1793                 ifi = parse_interface(udev, *i);
1794                 if (ifi < 0)
1795                         return ifi;
1796
1797                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1798                 if (r < 0) {
1799                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1800                         return r;
1801                 }
1802
1803                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1804                 if (r < 0) {
1805                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1806                         return r;
1807                 }
1808
1809                 n = strappend("mv-", *i);
1810                 if (!n)
1811                         return log_oom();
1812
1813                 strshorten(n, IFNAMSIZ-1);
1814
1815                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1816                 if (r < 0) {
1817                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1818                         return r;
1819                 }
1820
1821                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1822                 if (r < 0) {
1823                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1824                         return r;
1825                 }
1826
1827                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1828                 if (r < 0) {
1829                         log_error("Failed to open netlink container: %s", strerror(-r));
1830                         return r;
1831                 }
1832
1833                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1834                 if (r < 0) {
1835                         log_error("Failed to open netlink container: %s", strerror(-r));
1836                         return r;
1837                 }
1838
1839                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1840                 if (r < 0) {
1841                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1842                         return r;
1843                 }
1844
1845                 r = sd_rtnl_message_close_container(m);
1846                 if (r < 0) {
1847                         log_error("Failed to close netlink container: %s", strerror(-r));
1848                         return r;
1849                 }
1850
1851                 r = sd_rtnl_message_close_container(m);
1852                 if (r < 0) {
1853                         log_error("Failed to close netlink container: %s", strerror(-r));
1854                         return r;
1855                 }
1856
1857                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1858                 if (r < 0) {
1859                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1860                         return r;
1861                 }
1862         }
1863
1864         return 0;
1865 }
1866
/* Installs a seccomp filter that allows everything except creating
 * NETLINK_AUDIT sockets. Without HAVE_SECCOMP this is a no-op returning 0.
 *
 * Returns 0 on success or a negative error code if the filter could not
 * be built or loaded. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx filter;
        int ret;

        /*
           Much of the audit userspace hookup fails when run inside a
           container, since audit is broken there. Rather than care
           about that we simply prevent audit sockets from being
           created.

           With this filter socket(AF_NETLINK, *, NETLINK_AUDIT) fails
           with EAFNOSUPPORT, which audit userspace takes as the sign
           that audit is disabled in the kernel.
         */

        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        ret = seccomp_add_secondary_archs(filter);
        if (ret < 0)
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-ret));

        /* Each further step only runs if everything before it succeeded */
        if (ret >= 0) {
                ret = seccomp_rule_add(
                                filter,
                                SCMP_ACT_ERRNO(EAFNOSUPPORT),
                                SCMP_SYS(socket),
                                2,
                                SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                                SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
                if (ret < 0)
                        log_error("Failed to add audit seccomp rule: %s", strerror(-ret));
        }

        if (ret >= 0) {
                ret = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
                if (ret < 0)
                        log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-ret));
        }

        if (ret >= 0) {
                ret = seccomp_load(filter);
                if (ret < 0)
                        log_error("Failed to install seccomp audit filter: %s", strerror(-ret));
        }

        /* The filter context is released on success and on failure alike */
        seccomp_release(filter);
        return ret;
#else
        return 0;
#endif

}
1923
1924 static int setup_image(char **device_path, int *loop_nr) {
1925         struct loop_info64 info = {
1926                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1927         };
1928         _cleanup_close_ int fd = -1, control = -1, loop = -1;
1929         _cleanup_free_ char* loopdev = NULL;
1930         struct stat st;
1931         int r, nr;
1932
1933         assert(device_path);
1934         assert(loop_nr);
1935
1936         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1937         if (fd < 0) {
1938                 log_error("Failed to open %s: %m", arg_image);
1939                 return -errno;
1940         }
1941
1942         if (fstat(fd, &st) < 0) {
1943                 log_error("Failed to stat %s: %m", arg_image);
1944                 return -errno;
1945         }
1946
1947         if (S_ISBLK(st.st_mode)) {
1948                 char *p;
1949
1950                 p = strdup(arg_image);
1951                 if (!p)
1952                         return log_oom();
1953
1954                 *device_path = p;
1955
1956                 *loop_nr = -1;
1957
1958                 r = fd;
1959                 fd = -1;
1960
1961                 return r;
1962         }
1963
1964         if (!S_ISREG(st.st_mode)) {
1965                 log_error("%s is not a regular file or block device: %m", arg_image);
1966                 return -EINVAL;
1967         }
1968
1969         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1970         if (control < 0) {
1971                 log_error("Failed to open /dev/loop-control: %m");
1972                 return -errno;
1973         }
1974
1975         nr = ioctl(control, LOOP_CTL_GET_FREE);
1976         if (nr < 0) {
1977                 log_error("Failed to allocate loop device: %m");
1978                 return -errno;
1979         }
1980
1981         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1982                 return log_oom();
1983
1984         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1985         if (loop < 0) {
1986                 log_error("Failed to open loop device %s: %m", loopdev);
1987                 return -errno;
1988         }
1989
1990         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1991                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1992                 return -errno;
1993         }
1994
1995         if (arg_read_only)
1996                 info.lo_flags |= LO_FLAGS_READ_ONLY;
1997
1998         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1999                 log_error("Failed to set loopback settings on %s: %m", loopdev);
2000                 return -errno;
2001         }
2002
2003         *device_path = loopdev;
2004         loopdev = NULL;
2005
2006         *loop_nr = nr;
2007
2008         r = loop;
2009         loop = -1;
2010
2011         return r;
2012 }
2013
/* Probes the block device open on fd for a GPT partition table and
 * selects the partitions to use for the root, /home and /srv file
 * systems.
 *
 * Partitions are matched by their GPT type UUID (GPT_HOME, GPT_SRV and,
 * where defined, GPT_ROOT_NATIVE / GPT_ROOT_SECONDARY). Partitions
 * carrying GPT_FLAG_NO_AUTO are ignored. If several partitions of the
 * same type exist, the one with the lowest partition number wins.
 *
 * On success returns 0 and:
 *   - *root_device receives a newly allocated device node path of the
 *     native root partition, or of the secondary one if no native root
 *     was found; *secondary says which of the two it is;
 *   - *home_device / *srv_device receive newly allocated device node
 *     paths if such partitions were found and are left untouched
 *     otherwise;
 *   - the matching *_rw flag is false if the partition carries
 *     GPT_FLAG_READ_ONLY.
 * The caller owns the returned strings.
 *
 * Returns -EINVAL if no partition table, no GPT or no root partition is
 * found, another negative errno-style code on other failures, and
 * -ENOTSUP when compiled without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                /* A failure that leaves errno unset is treated as OOM */
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected */
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        /* pttype stays NULL if the lookup fails; streq_ptr() copes with that */
        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate the udev devices whose parent is the whole-disk device */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Initialized so that the cleanup handler never sees an
                 * indeterminate pointer, whatever path leaves the scope. */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number ... */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* ... and the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                /* Map the udev device back to blkid's partition entry */
                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                if (sd_id128_equal(type_id, GPT_HOME)) {

                        /* Keep the candidate with the lowest partition number */
                        if (home && nr >= home_nr)
                                continue;

                        home_nr = nr;
                        home_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(home);
                        home = strdup(node);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_id, GPT_SRV)) {

                        if (srv && nr >= srv_nr)
                                continue;

                        srv_nr = nr;
                        srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(srv);
                        srv = strdup(node);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                        if (root && nr >= root_nr)
                                continue;

                        root_nr = nr;
                        root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(root);
                        root = strdup(node);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && nr >= secondary_root_nr)
                                continue;

                        secondary_root_nr = nr;
                        secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(secondary_root);
                        secondary_root = strdup(node);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Hand the strings over to the caller; clearing the local
         * pointers keeps the cleanup handlers from freeing them. The
         * native root is preferred over the secondary one. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2263
/* Mounts the block device "what" on "where" (or on "where" with
 * "directory" appended, if directory is non-NULL). The file system type
 * is determined by probing the device with blkid.
 *
 * The mount is done with MS_NODEV, and additionally MS_RDONLY if rw is
 * false or arg_read_only is set.
 *
 * Returns 0 on success. Returns -EINVAL if the file system type cannot
 * be determined, -ENOTSUP for LUKS volumes or when compiled without
 * blkid support, and another negative errno-style code if probing or
 * mounting fails. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A hard
                 * probing error (-1) is handled below, the same way as
                 * in dissect_image(). */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2329
/* Mounts the given partitions below "where": root_device on where
 * itself, home_device on where/home and srv_device on where/srv. Any of
 * the device arguments may be NULL, in which case that mount is skipped.
 * The *_rw flags are passed through to mount_device().
 *
 * Returns 0 on success, or the negative error code of the first mount
 * that failed (later mounts are then not attempted). */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the caller-supplied target directory rather
         * than the global arg_directory, so the "where" parameter is
         * actually honoured. */
        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2365
2366 static void loop_remove(int nr, int *image_fd) {
2367         _cleanup_close_ int control = -1;
2368
2369         if (nr < 0)
2370                 return;
2371
2372         if (image_fd && *image_fd >= 0) {
2373                 ioctl(*image_fd, LOOP_CLR_FD);
2374                 *image_fd = safe_close(*image_fd);
2375         }
2376
2377         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2378         if (control < 0)
2379                 return;
2380
2381         ioctl(control, LOOP_CTL_REMOVE, nr);
2382 }
2383
2384 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2385         int pipe_fds[2];
2386         pid_t pid;
2387
2388         assert(database);
2389         assert(key);
2390         assert(rpid);
2391
2392         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2393                 log_error("Failed to allocate pipe: %m");
2394                 return -errno;
2395         }
2396
2397         pid = fork();
2398         if (pid < 0) {
2399                 log_error("Failed to fork getent child: %m");
2400                 return -errno;
2401         } else if (pid == 0) {
2402                 int nullfd;
2403                 char *empty_env = NULL;
2404
2405                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2406                         _exit(EXIT_FAILURE);
2407
2408                 if (pipe_fds[0] > 2)
2409                         safe_close(pipe_fds[0]);
2410                 if (pipe_fds[1] > 2)
2411                         safe_close(pipe_fds[1]);
2412
2413                 nullfd = open("/dev/null", O_RDWR);
2414                 if (nullfd < 0)
2415                         _exit(EXIT_FAILURE);
2416
2417                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2418                         _exit(EXIT_FAILURE);
2419
2420                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2421                         _exit(EXIT_FAILURE);
2422
2423                 if (nullfd > 2)
2424                         safe_close(nullfd);
2425
2426                 reset_all_signal_handlers();
2427                 close_all_fds(NULL, 0);
2428
2429                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2430                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2431                 _exit(EXIT_FAILURE);
2432         }
2433
2434         pipe_fds[1] = safe_close(pipe_fds[1]);
2435
2436         *rpid = pid;
2437
2438         return pipe_fds[0];
2439 }
2440
2441 static int change_uid_gid(char **_home) {
2442         char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2443         _cleanup_free_ uid_t *uids = NULL;
2444         _cleanup_free_ char *home = NULL;
2445         _cleanup_fclose_ FILE *f = NULL;
2446         _cleanup_close_ int fd = -1;
2447         unsigned n_uids = 0;
2448         size_t sz = 0, l;
2449         uid_t uid;
2450         gid_t gid;
2451         pid_t pid;
2452         int r;
2453
2454         assert(_home);
2455
2456         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2457                 /* Reset everything fully to 0, just in case */
2458
2459                 if (setgroups(0, NULL) < 0) {
2460                         log_error("setgroups() failed: %m");
2461                         return -errno;
2462                 }
2463
2464                 if (setresgid(0, 0, 0) < 0) {
2465                         log_error("setregid() failed: %m");
2466                         return -errno;
2467                 }
2468
2469                 if (setresuid(0, 0, 0) < 0) {
2470                         log_error("setreuid() failed: %m");
2471                         return -errno;
2472                 }
2473
2474                 *_home = NULL;
2475                 return 0;
2476         }
2477
2478         /* First, get user credentials */
2479         fd = spawn_getent("passwd", arg_user, &pid);
2480         if (fd < 0)
2481                 return fd;
2482
2483         f = fdopen(fd, "r");
2484         if (!f)
2485                 return log_oom();
2486         fd = -1;
2487
2488         if (!fgets(line, sizeof(line), f)) {
2489
2490                 if (!ferror(f)) {
2491                         log_error("Failed to resolve user %s.", arg_user);
2492                         return -ESRCH;
2493                 }
2494
2495                 log_error("Failed to read from getent: %m");
2496                 return -errno;
2497         }
2498
2499         truncate_nl(line);
2500
2501         wait_for_terminate_and_warn("getent passwd", pid);
2502
2503         x = strchr(line, ':');
2504         if (!x) {
2505                 log_error("/etc/passwd entry has invalid user field.");
2506                 return -EIO;
2507         }
2508
2509         u = strchr(x+1, ':');
2510         if (!u) {
2511                 log_error("/etc/passwd entry has invalid password field.");
2512                 return -EIO;
2513         }
2514
2515         u++;
2516         g = strchr(u, ':');
2517         if (!g) {
2518                 log_error("/etc/passwd entry has invalid UID field.");
2519                 return -EIO;
2520         }
2521
2522         *g = 0;
2523         g++;
2524         x = strchr(g, ':');
2525         if (!x) {
2526                 log_error("/etc/passwd entry has invalid GID field.");
2527                 return -EIO;
2528         }
2529
2530         *x = 0;
2531         h = strchr(x+1, ':');
2532         if (!h) {
2533                 log_error("/etc/passwd entry has invalid GECOS field.");
2534                 return -EIO;
2535         }
2536
2537         h++;
2538         x = strchr(h, ':');
2539         if (!x) {
2540                 log_error("/etc/passwd entry has invalid home directory field.");
2541                 return -EIO;
2542         }
2543
2544         *x = 0;
2545
2546         r = parse_uid(u, &uid);
2547         if (r < 0) {
2548                 log_error("Failed to parse UID of user.");
2549                 return -EIO;
2550         }
2551
2552         r = parse_gid(g, &gid);
2553         if (r < 0) {
2554                 log_error("Failed to parse GID of user.");
2555                 return -EIO;
2556         }
2557
2558         home = strdup(h);
2559         if (!home)
2560                 return log_oom();
2561
2562         /* Second, get group memberships */
2563         fd = spawn_getent("initgroups", arg_user, &pid);
2564         if (fd < 0)
2565                 return fd;
2566
2567         fclose(f);
2568         f = fdopen(fd, "r");
2569         if (!f)
2570                 return log_oom();
2571         fd = -1;
2572
2573         if (!fgets(line, sizeof(line), f)) {
2574                 if (!ferror(f)) {
2575                         log_error("Failed to resolve user %s.", arg_user);
2576                         return -ESRCH;
2577                 }
2578
2579                 log_error("Failed to read from getent: %m");
2580                 return -errno;
2581         }
2582
2583         truncate_nl(line);
2584
2585         wait_for_terminate_and_warn("getent initgroups", pid);
2586
2587         /* Skip over the username and subsequent separator whitespace */
2588         x = line;
2589         x += strcspn(x, WHITESPACE);
2590         x += strspn(x, WHITESPACE);
2591
2592         FOREACH_WORD(w, l, x, state) {
2593                 char c[l+1];
2594
2595                 memcpy(c, w, l);
2596                 c[l] = 0;
2597
2598                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2599                         return log_oom();
2600
2601                 r = parse_uid(c, &uids[n_uids++]);
2602                 if (r < 0) {
2603                         log_error("Failed to parse group data from getent.");
2604                         return -EIO;
2605                 }
2606         }
2607
2608         r = mkdir_parents(home, 0775);
2609         if (r < 0) {
2610                 log_error("Failed to make home root directory: %s", strerror(-r));
2611                 return r;
2612         }
2613
2614         r = mkdir_safe(home, 0755, uid, gid);
2615         if (r < 0 && r != -EEXIST) {
2616                 log_error("Failed to make home directory: %s", strerror(-r));
2617                 return r;
2618         }
2619
2620         fchown(STDIN_FILENO, uid, gid);
2621         fchown(STDOUT_FILENO, uid, gid);
2622         fchown(STDERR_FILENO, uid, gid);
2623
2624         if (setgroups(n_uids, uids) < 0) {
2625                 log_error("Failed to set auxiliary groups: %m");
2626                 return -errno;
2627         }
2628
2629         if (setresgid(gid, gid, gid) < 0) {
2630                 log_error("setregid() failed: %m");
2631                 return -errno;
2632         }
2633
2634         if (setresuid(uid, uid, uid) < 0) {
2635                 log_error("setreuid() failed: %m");
2636                 return -errno;
2637         }
2638
2639         if (_home) {
2640                 *_home = home;
2641                 home = NULL;
2642         }
2643
2644         return 0;
2645 }
2646
2647 /*
2648  * Return 0 in case the container is being rebooted, has been shut
2649  * down or exited successfully. On failures a negative value is
2650  * returned.
2651  *
2652  * The status of the container "CONTAINER_TERMINATED" or
2653  * "CONTAINER_REBOOTED" will be saved in the container argument
2654  */
2655 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2656         int r;
2657         siginfo_t status;
2658
2659         r = wait_for_terminate(pid, &status);
2660         if (r < 0)
2661                 return r;
2662
2663         switch (status.si_code) {
2664         case CLD_EXITED:
2665                 r = status.si_status;
2666                 if (r == 0) {
2667                         if (!arg_quiet)
2668                                 log_debug("Container %s exited successfully.",
2669                                           arg_machine);
2670
2671                         *container = CONTAINER_TERMINATED;
2672                 } else {
2673                         log_error("Container %s failed with error code %i.",
2674                                   arg_machine, status.si_status);
2675                         r = -1;
2676                 }
2677                 break;
2678
2679         case CLD_KILLED:
2680                 if (status.si_status == SIGINT) {
2681                         if (!arg_quiet)
2682                                 log_info("Container %s has been shut down.",
2683                                          arg_machine);
2684
2685                         *container = CONTAINER_TERMINATED;
2686                         r = 0;
2687                         break;
2688                 } else if (status.si_status == SIGHUP) {
2689                         if (!arg_quiet)
2690                                 log_info("Container %s is being rebooted.",
2691                                          arg_machine);
2692
2693                         *container = CONTAINER_REBOOTED;
2694                         r = 0;
2695                         break;
2696                 }
2697                 /* CLD_KILLED fallthrough */
2698
2699         case CLD_DUMPED:
2700                 log_error("Container %s terminated by signal %s.",
2701                           arg_machine, signal_to_string(status.si_status));
2702                 r = -1;
2703                 break;
2704
2705         default:
2706                 log_error("Container %s failed due to unknown reason.",
2707                           arg_machine);
2708                 r = -1;
2709                 break;
2710         }
2711
2712         return r;
2713 }
2714
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts blocking calls rather than being
 * ignored. */
static void nop_handler(int sig) {
        (void) sig;
}
2716
2717 int main(int argc, char *argv[]) {
2718
2719         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2720         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2721         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2722         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2723         _cleanup_fdset_free_ FDSet *fds = NULL;
2724         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2725         const char *console = NULL;
2726         char veth_name[IFNAMSIZ];
2727         bool secondary = false;
2728         sigset_t mask, mask_chld;
2729         pid_t pid = 0;
2730
2731         log_parse_environment();
2732         log_open();
2733
2734         k = parse_argv(argc, argv);
2735         if (k < 0)
2736                 goto finish;
2737         else if (k == 0) {
2738                 r = EXIT_SUCCESS;
2739                 goto finish;
2740         }
2741
2742         if (!arg_image) {
2743                 if (arg_directory) {
2744                         char *p;
2745
2746                         p = path_make_absolute_cwd(arg_directory);
2747                         free(arg_directory);
2748                         arg_directory = p;
2749                 } else
2750                         arg_directory = get_current_dir_name();
2751
2752                 if (!arg_directory) {
2753                         log_error("Failed to determine path, please use -D.");
2754                         goto finish;
2755                 }
2756                 path_kill_slashes(arg_directory);
2757         }
2758
2759         if (!arg_machine) {
2760                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2761                 if (!arg_machine) {
2762                         log_oom();
2763                         goto finish;
2764                 }
2765
2766                 hostname_cleanup(arg_machine, false);
2767                 if (isempty(arg_machine)) {
2768                         log_error("Failed to determine machine name automatically, please use -M.");
2769                         goto finish;
2770                 }
2771         }
2772
2773         if (geteuid() != 0) {
2774                 log_error("Need to be root.");
2775                 goto finish;
2776         }
2777
2778         if (sd_booted() <= 0) {
2779                 log_error("Not running on a systemd system.");
2780                 goto finish;
2781         }
2782
2783         log_close();
2784         n_fd_passed = sd_listen_fds(false);
2785         if (n_fd_passed > 0) {
2786                 k = fdset_new_listen_fds(&fds, false);
2787                 if (k < 0) {
2788                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2789                         goto finish;
2790                 }
2791         }
2792         fdset_close_others(fds);
2793         log_open();
2794
2795         if (arg_directory) {
2796                 if (path_equal(arg_directory, "/")) {
2797                         log_error("Spawning container on root directory not supported.");
2798                         goto finish;
2799                 }
2800
2801                 if (arg_boot) {
2802                         if (path_is_os_tree(arg_directory) <= 0) {
2803                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2804                                 goto finish;
2805                         }
2806                 } else {
2807                         const char *p;
2808
2809                         p = strappenda(arg_directory,
2810                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2811                         if (access(p, F_OK) < 0) {
2812                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2813                                 goto finish;
2814
2815                         }
2816                 }
2817         } else {
2818                 char template[] = "/tmp/nspawn-root-XXXXXX";
2819
2820                 if (!mkdtemp(template)) {
2821                         log_error("Failed to create temporary directory: %m");
2822                         r = -errno;
2823                         goto finish;
2824                 }
2825
2826                 arg_directory = strdup(template);
2827                 if (!arg_directory) {
2828                         r = log_oom();
2829                         goto finish;
2830                 }
2831
2832                 image_fd = setup_image(&device_path, &loop_nr);
2833                 if (image_fd < 0) {
2834                         r = image_fd;
2835                         goto finish;
2836                 }
2837
2838                 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2839                 if (r < 0)
2840                         goto finish;
2841         }
2842
2843         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2844         if (master < 0) {
2845                 log_error("Failed to acquire pseudo tty: %m");
2846                 goto finish;
2847         }
2848
2849         console = ptsname(master);
2850         if (!console) {
2851                 log_error("Failed to determine tty name: %m");
2852                 goto finish;
2853         }
2854
2855         if (!arg_quiet)
2856                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2857                          arg_machine, arg_image ? arg_image : arg_directory);
2858
2859         if (unlockpt(master) < 0) {
2860                 log_error("Failed to unlock tty: %m");
2861                 goto finish;
2862         }
2863
2864         if (access("/dev/kdbus/control", F_OK) >= 0) {
2865
2866                 if (arg_share_system) {
2867                         kdbus_domain = strdup("/dev/kdbus");
2868                         if (!kdbus_domain) {
2869                                 log_oom();
2870                                 goto finish;
2871                         }
2872                 } else {
2873                         const char *ns;
2874
2875                         ns = strappenda("machine-", arg_machine);
2876                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2877                         if (r < 0)
2878                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2879                         else
2880                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2881                 }
2882         }
2883
2884         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2885                 log_error("Failed to create kmsg socket pair: %m");
2886                 goto finish;
2887         }
2888
2889         sd_notify(0, "READY=1");
2890
2891         assert_se(sigemptyset(&mask) == 0);
2892         assert_se(sigemptyset(&mask_chld) == 0);
2893         sigaddset(&mask_chld, SIGCHLD);
2894         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2895         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2896
2897         for (;;) {
2898                 ContainerStatus container_status;
2899                 int eventfds[2] = { -1, -1 };
2900                 struct sigaction sa = {
2901                         .sa_handler = nop_handler,
2902                         .sa_flags = SA_NOCLDSTOP,
2903                 };
2904
2905                 /* Child can be killed before execv(), so handle SIGCHLD
2906                  * in order to interrupt parent's blocking calls and
2907                  * give it a chance to call wait() and terminate. */
2908                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2909                 if (r < 0) {
2910                         log_error("Failed to change the signal mask: %m");
2911                         goto finish;
2912                 }
2913
2914                 r = sigaction(SIGCHLD, &sa, NULL);
2915                 if (r < 0) {
2916                         log_error("Failed to install SIGCHLD handler: %m");
2917                         goto finish;
2918                 }
2919
2920                 pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
2921                                          (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2922                                          (arg_private_network ? CLONE_NEWNET : 0), eventfds);
2923                 if (pid < 0) {
2924                         if (errno == EINVAL)
2925                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2926                         else
2927                                 log_error("clone() failed: %m");
2928
2929                         r = pid;
2930                         goto finish;
2931                 }
2932
2933                 if (pid == 0) {
2934                         /* child */
2935                         _cleanup_free_ char *home = NULL;
2936                         unsigned n_env = 2;
2937                         const char *envp[] = {
2938                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
2939                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2940                                 NULL, /* TERM */
2941                                 NULL, /* HOME */
2942                                 NULL, /* USER */
2943                                 NULL, /* LOGNAME */
2944                                 NULL, /* container_uuid */
2945                                 NULL, /* LISTEN_FDS */
2946                                 NULL, /* LISTEN_PID */
2947                                 NULL
2948                         };
2949                         char **env_use;
2950
2951                         envp[n_env] = strv_find_prefix(environ, "TERM=");
2952                         if (envp[n_env])
2953                                 n_env ++;
2954
2955                         master = safe_close(master);
2956
2957                         close_nointr(STDIN_FILENO);
2958                         close_nointr(STDOUT_FILENO);
2959                         close_nointr(STDERR_FILENO);
2960
2961                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2962
2963                         reset_all_signal_handlers();
2964
2965                         assert_se(sigemptyset(&mask) == 0);
2966                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2967
2968                         k = open_terminal(console, O_RDWR);
2969                         if (k != STDIN_FILENO) {
2970                                 if (k >= 0) {
2971                                         safe_close(k);
2972                                         k = -EINVAL;
2973                                 }
2974
2975                                 log_error("Failed to open console: %s", strerror(-k));
2976                                 goto child_fail;
2977                         }
2978
2979                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2980                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2981                                 log_error("Failed to duplicate console: %m");
2982                                 goto child_fail;
2983                         }
2984
2985                         if (setsid() < 0) {
2986                                 log_error("setsid() failed: %m");
2987                                 goto child_fail;
2988                         }
2989
2990                         if (reset_audit_loginuid() < 0)
2991                                 goto child_fail;
2992
2993                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2994                                 log_error("PR_SET_PDEATHSIG failed: %m");
2995                                 goto child_fail;
2996                         }
2997
2998                         /* Mark everything as slave, so that we still
2999                          * receive mounts from the real root, but don't
3000                          * propagate mounts to the real root. */
3001                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3002                                 log_error("MS_SLAVE|MS_REC failed: %m");
3003                                 goto child_fail;
3004                         }
3005
3006                         if (mount_devices(arg_directory,
3007                                           root_device, root_device_rw,
3008                                           home_device, home_device_rw,
3009                                           srv_device, srv_device_rw) < 0)
3010                                 goto child_fail;
3011
3012                         r = base_filesystem_create(arg_directory);
3013                         if (r < 0) {
3014                                 log_error("creating base filesystem failed: %s", strerror(-r));
3015                                 goto child_fail;
3016                         }
3017
3018                         /* Turn directory into bind mount */
3019                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3020                                 log_error("Failed to make bind mount: %m");
3021                                 goto child_fail;
3022                         }
3023
3024                         if (arg_read_only) {
3025                                 k = bind_remount_recursive(arg_directory, true);
3026                                 if (k < 0) {
3027                                         log_error("Failed to make tree read-only: %s", strerror(-k));
3028                                         goto child_fail;
3029                                 }
3030                         }
3031
3032                         if (mount_all(arg_directory) < 0)
3033                                 goto child_fail;
3034
3035                         if (copy_devnodes(arg_directory) < 0)
3036                                 goto child_fail;
3037
3038                         if (setup_ptmx(arg_directory) < 0)
3039                                 goto child_fail;
3040
3041                         dev_setup(arg_directory);
3042
3043                         if (audit_still_doesnt_work_in_containers() < 0)
3044                                 goto child_fail;
3045
3046                         if (setup_dev_console(arg_directory, console) < 0)
3047                                 goto child_fail;
3048
3049                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3050                                 goto child_fail;
3051
3052                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3053
3054                         if (setup_boot_id(arg_directory) < 0)
3055                                 goto child_fail;
3056
3057                         if (setup_timezone(arg_directory) < 0)
3058                                 goto child_fail;
3059
3060                         if (setup_resolv_conf(arg_directory) < 0)
3061                                 goto child_fail;
3062
3063                         if (setup_journal(arg_directory) < 0)
3064                                 goto child_fail;
3065
3066                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3067                                 goto child_fail;
3068
3069                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3070                                 goto child_fail;
3071
3072                         if (mount_tmpfs(arg_directory) < 0)
3073                                 goto child_fail;
3074
3075                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3076                                 goto child_fail;
3077
3078                         /* Tell the parent that we are ready, and that
3079                          * it can cgroupify us to that we lack access
3080                          * to certain devices and resources. */
3081                         r = eventfd_send_state(eventfds[1],
3082                                                EVENTFD_CHILD_SUCCEEDED);
3083                         eventfds[1] = safe_close(eventfds[1]);
3084                         if (r < 0)
3085                                 goto child_fail;
3086
3087                         if (chdir(arg_directory) < 0) {
3088                                 log_error("chdir(%s) failed: %m", arg_directory);
3089                                 goto child_fail;
3090                         }
3091
3092                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3093                                 log_error("mount(MS_MOVE) failed: %m");
3094                                 goto child_fail;
3095                         }
3096
3097                         if (chroot(".") < 0) {
3098                                 log_error("chroot() failed: %m");
3099                                 goto child_fail;
3100                         }
3101
3102                         if (chdir("/") < 0) {
3103                                 log_error("chdir() failed: %m");
3104                                 goto child_fail;
3105                         }
3106
3107                         umask(0022);
3108
3109                         if (arg_private_network)
3110                                 loopback_setup();
3111
3112                         if (drop_capabilities() < 0) {
3113                                 log_error("drop_capabilities() failed: %m");
3114                                 goto child_fail;
3115                         }
3116
3117                         r = change_uid_gid(&home);
3118                         if (r < 0)
3119                                 goto child_fail;
3120
3121                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3122                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3123                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3124                                 log_oom();
3125                                 goto child_fail;
3126                         }
3127
3128                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3129                                 char as_uuid[37];
3130
3131                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3132                                         log_oom();
3133                                         goto child_fail;
3134                                 }
3135                         }
3136
3137                         if (fdset_size(fds) > 0) {
3138                                 k = fdset_cloexec(fds, false);
3139                                 if (k < 0) {
3140                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3141                                         goto child_fail;
3142                                 }
3143
3144                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3145                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3146                                         log_oom();
3147                                         goto child_fail;
3148                                 }
3149                         }
3150
3151                         setup_hostname();
3152
3153                         if (arg_personality != 0xffffffffLU) {
3154                                 if (personality(arg_personality) < 0) {
3155                                         log_error("personality() failed: %m");
3156                                         goto child_fail;
3157                                 }
3158                         } else if (secondary) {
3159                                 if (personality(PER_LINUX32) < 0) {
3160                                         log_error("personality() failed: %m");
3161                                         goto child_fail;
3162                                 }
3163                         }
3164
3165 #ifdef HAVE_SELINUX
3166                         if (arg_selinux_context)
3167                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3168                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3169                                         goto child_fail;
3170                                 }
3171 #endif
3172
3173                         if (!strv_isempty(arg_setenv)) {
3174                                 char **n;
3175
3176                                 n = strv_env_merge(2, envp, arg_setenv);
3177                                 if (!n) {
3178                                         log_oom();
3179                                         goto child_fail;
3180                                 }
3181
3182                                 env_use = n;
3183                         } else
3184                                 env_use = (char**) envp;
3185
3186                         /* Wait until the parent is ready with the setup, too... */
3187                         r = eventfd_parent_succeeded(eventfds[0]);
3188                         eventfds[0] = safe_close(eventfds[0]);
3189                         if (r < 0)
3190                                 goto child_fail;
3191
3192                         if (arg_boot) {
3193                                 char **a;
3194                                 size_t l;
3195
3196                                 /* Automatically search for the init system */
3197
3198                                 l = 1 + argc - optind;
3199                                 a = newa(char*, l + 1);
3200                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3201
3202                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3203                                 execve(a[0], a, env_use);
3204
3205                                 a[0] = (char*) "/lib/systemd/systemd";
3206                                 execve(a[0], a, env_use);
3207
3208                                 a[0] = (char*) "/sbin/init";
3209                                 execve(a[0], a, env_use);
3210                         } else if (argc > optind)
3211                                 execvpe(argv[optind], argv + optind, env_use);
3212                         else {
3213                                 chdir(home ? home : "/root");
3214                                 execle("/bin/bash", "-bash", NULL, env_use);
3215                                 execle("/bin/sh", "-sh", NULL, env_use);
3216                         }
3217
3218                         log_error("execv() failed: %m");
3219
3220                 child_fail:
3221                         /* Tell the parent that the setup failed, so he
3222                          * can clean up resources and terminate. */
3223                         if (eventfds[1] != -1)
3224                                 eventfd_send_state(eventfds[1],
3225                                                    EVENTFD_CHILD_FAILED);
3226                         _exit(EXIT_FAILURE);
3227                 }
3228
3229                 fdset_free(fds);
3230                 fds = NULL;
3231
3232                 /* Wait for the child event:
3233                  * If EVENTFD_CHILD_FAILED, the child will terminate soon.
3234                  * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
3235                  * it is ready with all it needs to do with priviliges.
3236                  * After we got the notification we can make the process
3237                  * join its cgroup which might limit what it can do */
3238                 r = eventfd_child_succeeded(eventfds[1]);
3239                 eventfds[1] = safe_close(eventfds[1]);
3240                 if (r < 0)
3241                         goto check_container_status;
3242
3243                 r = register_machine(pid);
3244                 if (r < 0)
3245                         goto finish;
3246
3247                 r = move_network_interfaces(pid);
3248                 if (r < 0)
3249                         goto finish;
3250
3251                 r = setup_veth(pid, veth_name);
3252                 if (r < 0)
3253                         goto finish;
3254
3255                 r = setup_bridge(veth_name);
3256                 if (r < 0)
3257                         goto finish;
3258
3259                 r = setup_macvlan(pid);
3260                 if (r < 0)
3261                         goto finish;
3262
3263                 /* Block SIGCHLD here, before notifying child.
3264                  * process_pty() will handle it with the other signals. */
3265                 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3266                 if (r < 0)
3267                         goto finish;
3268
3269                 /* Reset signal to default */
3270                 r = default_signals(SIGCHLD, -1);
3271                 if (r < 0)
3272                         goto finish;
3273
3274                 /* Notify the child that the parent is ready with all
3275                  * its setup, and that the child can now hand over
3276                  * control to the code to run inside the container. */
3277                 r = eventfd_send_state(eventfds[0],
3278                                        EVENTFD_PARENT_SUCCEEDED);
3279                 eventfds[0] = safe_close(eventfds[0]);
3280                 if (r < 0)
3281                         goto finish;
3282
3283                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3284                 if (k < 0) {
3285                         r = EXIT_FAILURE;
3286                         break;
3287                 }
3288
3289                 if (!arg_quiet)
3290                         putc('\n', stdout);
3291
3292                 /* Kill if it is not dead yet anyway */
3293                 terminate_machine(pid);
3294
3295 check_container_status:
3296                 /* Redundant, but better safe than sorry */
3297                 kill(pid, SIGKILL);
3298
3299                 r = wait_for_container(pid, &container_status);
3300                 pid = 0;
3301
3302                 if (r < 0) {
3303                         r = EXIT_FAILURE;
3304                         break;
3305                 } else if (container_status == CONTAINER_TERMINATED)
3306                         break;
3307
3308                 /* CONTAINER_REBOOTED, loop again */
3309         }
3310
3311 finish:
3312         loop_remove(loop_nr, &image_fd);
3313
3314         if (pid > 0)
3315                 kill(pid, SIGKILL);
3316
3317         free(arg_directory);
3318         free(arg_machine);
3319         free(arg_user);
3320         strv_free(arg_setenv);
3321         strv_free(arg_network_interfaces);
3322         strv_free(arg_network_macvlan);
3323         strv_free(arg_bind);
3324         strv_free(arg_bind_ro);
3325         strv_free(arg_tmpfs);
3326
3327         return r;
3328 }