chiark / gitweb /
os-release: define /usr/lib/os-release as fallback for /etc/os-release
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "eventfd-util.h"
88 #include "blkid-util.h"
89 #include "gpt.h"
90 #include "siphash24.h"
91 #include "copy.h"
92
93 #ifdef HAVE_SECCOMP
94 #include "seccomp-util.h"
95 #endif
96
/* Outcome codes for a container run. The code that produces and consumes
 * these values is outside this part of the file; judging by the names they
 * presumably distinguish a plain exit from a reboot request -- TODO confirm. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
101
/* Journal linking mode, selected with --link-journal= ("no", "auto", "host",
 * "guest") or with -j. */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* --link-journal=no */
        LINK_AUTO  = 1,   /* --link-journal=auto; the built-in default */
        LINK_HOST  = 2,   /* --link-journal=host */
        LINK_GUEST = 3    /* --link-journal=guest, also what -j selects */
} LinkJournal;
108
109 static char *arg_directory = NULL;
110 static char *arg_user = NULL;
111 static sd_id128_t arg_uuid = {};
112 static char *arg_machine = NULL;
113 static const char *arg_selinux_context = NULL;
114 static const char *arg_selinux_apifs_context = NULL;
115 static const char *arg_slice = NULL;
116 static bool arg_private_network = false;
117 static bool arg_read_only = false;
118 static bool arg_boot = false;
119 static LinkJournal arg_link_journal = LINK_AUTO;
120 static uint64_t arg_retain =
121         (1ULL << CAP_CHOWN) |
122         (1ULL << CAP_DAC_OVERRIDE) |
123         (1ULL << CAP_DAC_READ_SEARCH) |
124         (1ULL << CAP_FOWNER) |
125         (1ULL << CAP_FSETID) |
126         (1ULL << CAP_IPC_OWNER) |
127         (1ULL << CAP_KILL) |
128         (1ULL << CAP_LEASE) |
129         (1ULL << CAP_LINUX_IMMUTABLE) |
130         (1ULL << CAP_NET_BIND_SERVICE) |
131         (1ULL << CAP_NET_BROADCAST) |
132         (1ULL << CAP_NET_RAW) |
133         (1ULL << CAP_SETGID) |
134         (1ULL << CAP_SETFCAP) |
135         (1ULL << CAP_SETPCAP) |
136         (1ULL << CAP_SETUID) |
137         (1ULL << CAP_SYS_ADMIN) |
138         (1ULL << CAP_SYS_CHROOT) |
139         (1ULL << CAP_SYS_NICE) |
140         (1ULL << CAP_SYS_PTRACE) |
141         (1ULL << CAP_SYS_TTY_CONFIG) |
142         (1ULL << CAP_SYS_RESOURCE) |
143         (1ULL << CAP_SYS_BOOT) |
144         (1ULL << CAP_AUDIT_WRITE) |
145         (1ULL << CAP_AUDIT_CONTROL) |
146         (1ULL << CAP_MKNOD);
147 static char **arg_bind = NULL;
148 static char **arg_bind_ro = NULL;
149 static char **arg_tmpfs = NULL;
150 static char **arg_setenv = NULL;
151 static bool arg_quiet = false;
152 static bool arg_share_system = false;
153 static bool arg_register = true;
154 static bool arg_keep_unit = false;
155 static char **arg_network_interfaces = NULL;
156 static char **arg_network_macvlan = NULL;
157 static bool arg_network_veth = false;
158 static const char *arg_network_bridge = NULL;
159 static unsigned long arg_personality = 0xffffffffLU;
160 static const char *arg_image = NULL;
161
162 static int help(void) {
163
164         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
165                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
166                "  -h --help                 Show this help\n"
167                "     --version              Print version string\n"
168                "  -q --quiet                Do not show status information\n"
169                "  -D --directory=PATH       Root directory for the container\n"
170                "  -i --image=PATH           File system device or image for the container\n"
171                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
172                "  -u --user=USER            Run the command under specified user or uid\n"
173                "  -M --machine=NAME         Set the machine name for the container\n"
174                "     --uuid=UUID            Set a specific machine UUID for the container\n"
175                "  -S --slice=SLICE          Place the container in the specified slice\n"
176                "     --private-network      Disable network in container\n"
177                "     --network-interface=INTERFACE\n"
178                "                            Assign an existing network interface to the\n"
179                "                            container\n"
180                "     --network-macvlan=INTERFACE\n"
181                "                            Create a macvlan network interface based on an\n"
182                "                            existing network interface to the container\n"
183                "     --network-veth         Add a virtual ethernet connection between host\n"
184                "                            and container\n"
185                "     --network-bridge=INTERFACE\n"
186                "                            Add a virtual ethernet connection between host\n"
187                "                            and container and add it to an existing bridge on\n"
188                "                            the host\n"
189                "  -Z --selinux-context=SECLABEL\n"
190                "                            Set the SELinux security context to be used by\n"
191                "                            processes in the container\n"
192                "  -L --selinux-apifs-context=SECLABEL\n"
193                "                            Set the SELinux security context to be used by\n"
194                "                            API/tmpfs file systems in the container\n"
195                "     --capability=CAP       In addition to the default, retain specified\n"
196                "                            capability\n"
197                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
198                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
199                "  -j                        Equivalent to --link-journal=host\n"
200                "     --read-only            Mount the root directory read-only\n"
201                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
202                "                            the container\n"
203                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
204                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
205                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
206                "     --share-system         Share system namespaces with host\n"
207                "     --register=BOOLEAN     Register container as machine\n"
208                "     --keep-unit            Do not register a scope for the machine, reuse\n"
209                "                            the service unit nspawn is running in\n",
210                program_invocation_short_name);
211
212         return 0;
213 }
214
215 static int parse_argv(int argc, char *argv[]) {
216
217         enum {
218                 ARG_VERSION = 0x100,
219                 ARG_PRIVATE_NETWORK,
220                 ARG_UUID,
221                 ARG_READ_ONLY,
222                 ARG_CAPABILITY,
223                 ARG_DROP_CAPABILITY,
224                 ARG_LINK_JOURNAL,
225                 ARG_BIND,
226                 ARG_BIND_RO,
227                 ARG_TMPFS,
228                 ARG_SETENV,
229                 ARG_SHARE_SYSTEM,
230                 ARG_REGISTER,
231                 ARG_KEEP_UNIT,
232                 ARG_NETWORK_INTERFACE,
233                 ARG_NETWORK_MACVLAN,
234                 ARG_NETWORK_VETH,
235                 ARG_NETWORK_BRIDGE,
236                 ARG_PERSONALITY,
237         };
238
239         static const struct option options[] = {
240                 { "help",                  no_argument,       NULL, 'h'                   },
241                 { "version",               no_argument,       NULL, ARG_VERSION           },
242                 { "directory",             required_argument, NULL, 'D'                   },
243                 { "user",                  required_argument, NULL, 'u'                   },
244                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
245                 { "boot",                  no_argument,       NULL, 'b'                   },
246                 { "uuid",                  required_argument, NULL, ARG_UUID              },
247                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
248                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
249                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
250                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
251                 { "bind",                  required_argument, NULL, ARG_BIND              },
252                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
253                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
254                 { "machine",               required_argument, NULL, 'M'                   },
255                 { "slice",                 required_argument, NULL, 'S'                   },
256                 { "setenv",                required_argument, NULL, ARG_SETENV            },
257                 { "selinux-context",       required_argument, NULL, 'Z'                   },
258                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
259                 { "quiet",                 no_argument,       NULL, 'q'                   },
260                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
261                 { "register",              required_argument, NULL, ARG_REGISTER          },
262                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
263                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
264                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
265                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
266                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
267                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
268                 { "image",                 required_argument, NULL, 'i'                   },
269                 {}
270         };
271
272         int c, r;
273         uint64_t plus = 0, minus = 0;
274
275         assert(argc >= 0);
276         assert(argv);
277
278         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
279
280                 switch (c) {
281
282                 case 'h':
283                         return help();
284
285                 case ARG_VERSION:
286                         puts(PACKAGE_STRING);
287                         puts(SYSTEMD_FEATURES);
288                         return 0;
289
290                 case 'D':
291                         free(arg_directory);
292                         arg_directory = canonicalize_file_name(optarg);
293                         if (!arg_directory) {
294                                 log_error("Invalid root directory: %m");
295                                 return -ENOMEM;
296                         }
297
298                         break;
299
300                 case 'i':
301                         arg_image = optarg;
302                         break;
303
304                 case 'u':
305                         free(arg_user);
306                         arg_user = strdup(optarg);
307                         if (!arg_user)
308                                 return log_oom();
309
310                         break;
311
312                 case ARG_NETWORK_BRIDGE:
313                         arg_network_bridge = optarg;
314
315                         /* fall through */
316
317                 case ARG_NETWORK_VETH:
318                         arg_network_veth = true;
319                         arg_private_network = true;
320                         break;
321
322                 case ARG_NETWORK_INTERFACE:
323                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
324                                 return log_oom();
325
326                         arg_private_network = true;
327                         break;
328
329                 case ARG_NETWORK_MACVLAN:
330                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
331                                 return log_oom();
332
333                         /* fall through */
334
335                 case ARG_PRIVATE_NETWORK:
336                         arg_private_network = true;
337                         break;
338
339                 case 'b':
340                         arg_boot = true;
341                         break;
342
343                 case ARG_UUID:
344                         r = sd_id128_from_string(optarg, &arg_uuid);
345                         if (r < 0) {
346                                 log_error("Invalid UUID: %s", optarg);
347                                 return r;
348                         }
349                         break;
350
351                 case 'S':
352                         arg_slice = optarg;
353                         break;
354
355                 case 'M':
356                         if (isempty(optarg)) {
357                                 free(arg_machine);
358                                 arg_machine = NULL;
359                         } else {
360
361                                 if (!hostname_is_valid(optarg)) {
362                                         log_error("Invalid machine name: %s", optarg);
363                                         return -EINVAL;
364                                 }
365
366                                 free(arg_machine);
367                                 arg_machine = strdup(optarg);
368                                 if (!arg_machine)
369                                         return log_oom();
370
371                                 break;
372                         }
373
374                 case 'Z':
375                         arg_selinux_context = optarg;
376                         break;
377
378                 case 'L':
379                         arg_selinux_apifs_context = optarg;
380                         break;
381
382                 case ARG_READ_ONLY:
383                         arg_read_only = true;
384                         break;
385
386                 case ARG_CAPABILITY:
387                 case ARG_DROP_CAPABILITY: {
388                         char *state, *word;
389                         size_t length;
390
391                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
392                                 _cleanup_free_ char *t;
393                                 cap_value_t cap;
394
395                                 t = strndup(word, length);
396                                 if (!t)
397                                         return log_oom();
398
399                                 if (streq(t, "all")) {
400                                         if (c == ARG_CAPABILITY)
401                                                 plus = (uint64_t) -1;
402                                         else
403                                                 minus = (uint64_t) -1;
404                                 } else {
405                                         if (cap_from_name(t, &cap) < 0) {
406                                                 log_error("Failed to parse capability %s.", t);
407                                                 return -EINVAL;
408                                         }
409
410                                         if (c == ARG_CAPABILITY)
411                                                 plus |= 1ULL << (uint64_t) cap;
412                                         else
413                                                 minus |= 1ULL << (uint64_t) cap;
414                                 }
415                         }
416
417                         break;
418                 }
419
420                 case 'j':
421                         arg_link_journal = LINK_GUEST;
422                         break;
423
424                 case ARG_LINK_JOURNAL:
425                         if (streq(optarg, "auto"))
426                                 arg_link_journal = LINK_AUTO;
427                         else if (streq(optarg, "no"))
428                                 arg_link_journal = LINK_NO;
429                         else if (streq(optarg, "guest"))
430                                 arg_link_journal = LINK_GUEST;
431                         else if (streq(optarg, "host"))
432                                 arg_link_journal = LINK_HOST;
433                         else {
434                                 log_error("Failed to parse link journal mode %s", optarg);
435                                 return -EINVAL;
436                         }
437
438                         break;
439
440                 case ARG_BIND:
441                 case ARG_BIND_RO: {
442                         _cleanup_free_ char *a = NULL, *b = NULL;
443                         char *e;
444                         char ***x;
445
446                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
447
448                         e = strchr(optarg, ':');
449                         if (e) {
450                                 a = strndup(optarg, e - optarg);
451                                 b = strdup(e + 1);
452                         } else {
453                                 a = strdup(optarg);
454                                 b = strdup(optarg);
455                         }
456
457                         if (!a || !b)
458                                 return log_oom();
459
460                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
461                                 log_error("Invalid bind mount specification: %s", optarg);
462                                 return -EINVAL;
463                         }
464
465                         r = strv_extend(x, a);
466                         if (r < 0)
467                                 return log_oom();
468
469                         r = strv_extend(x, b);
470                         if (r < 0)
471                                 return log_oom();
472
473                         break;
474                 }
475
476                 case ARG_TMPFS: {
477                         _cleanup_free_ char *a = NULL, *b = NULL;
478                         char *e;
479
480                         e = strchr(optarg, ':');
481                         if (e) {
482                                 a = strndup(optarg, e - optarg);
483                                 b = strdup(e + 1);
484                         } else {
485                                 a = strdup(optarg);
486                                 b = strdup("mode=0755");
487                         }
488
489                         if (!a || !b)
490                                 return log_oom();
491
492                         if (!path_is_absolute(a)) {
493                                 log_error("Invalid tmpfs specification: %s", optarg);
494                                 return -EINVAL;
495                         }
496
497                         r = strv_push(&arg_tmpfs, a);
498                         if (r < 0)
499                                 return log_oom();
500
501                         a = NULL;
502
503                         r = strv_push(&arg_tmpfs, b);
504                         if (r < 0)
505                                 return log_oom();
506
507                         b = NULL;
508
509                         break;
510                 }
511
512                 case ARG_SETENV: {
513                         char **n;
514
515                         if (!env_assignment_is_valid(optarg)) {
516                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
517                                 return -EINVAL;
518                         }
519
520                         n = strv_env_set(arg_setenv, optarg);
521                         if (!n)
522                                 return log_oom();
523
524                         strv_free(arg_setenv);
525                         arg_setenv = n;
526                         break;
527                 }
528
529                 case 'q':
530                         arg_quiet = true;
531                         break;
532
533                 case ARG_SHARE_SYSTEM:
534                         arg_share_system = true;
535                         break;
536
537                 case ARG_REGISTER:
538                         r = parse_boolean(optarg);
539                         if (r < 0) {
540                                 log_error("Failed to parse --register= argument: %s", optarg);
541                                 return r;
542                         }
543
544                         arg_register = r;
545                         break;
546
547                 case ARG_KEEP_UNIT:
548                         arg_keep_unit = true;
549                         break;
550
551                 case ARG_PERSONALITY:
552
553                         arg_personality = personality_from_string(optarg);
554                         if (arg_personality == 0xffffffffLU) {
555                                 log_error("Unknown or unsupported personality '%s'.", optarg);
556                                 return -EINVAL;
557                         }
558
559                         break;
560
561                 case '?':
562                         return -EINVAL;
563
564                 default:
565                         assert_not_reached("Unhandled option");
566                 }
567         }
568
569         if (arg_share_system)
570                 arg_register = false;
571
572         if (arg_boot && arg_share_system) {
573                 log_error("--boot and --share-system may not be combined.");
574                 return -EINVAL;
575         }
576
577         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
578                 log_error("--keep-unit may not be used when invoked from a user session.");
579                 return -EINVAL;
580         }
581
582         if (arg_directory && arg_image) {
583                 log_error("--directory= and --image= may not be combined.");
584                 return -EINVAL;
585         }
586
587         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
588
589         return 1;
590 }
591
592 static int mount_all(const char *dest) {
593
594         typedef struct MountPoint {
595                 const char *what;
596                 const char *where;
597                 const char *type;
598                 const char *options;
599                 unsigned long flags;
600                 bool fatal;
601         } MountPoint;
602
603         static const MountPoint mount_table[] = {
604                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
605                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
606                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
607                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
608                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
609                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
610                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
611                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
612 #ifdef HAVE_SELINUX
613                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
614                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
615 #endif
616         };
617
618         unsigned k;
619         int r = 0;
620
621         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
622                 _cleanup_free_ char *where = NULL;
623 #ifdef HAVE_SELINUX
624                 _cleanup_free_ char *options = NULL;
625 #endif
626                 const char *o;
627                 int t;
628
629                 where = strjoin(dest, "/", mount_table[k].where, NULL);
630                 if (!where)
631                         return log_oom();
632
633                 t = path_is_mount_point(where, true);
634                 if (t < 0) {
635                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
636
637                         if (r == 0)
638                                 r = t;
639
640                         continue;
641                 }
642
643                 /* Skip this entry if it is not a remount. */
644                 if (mount_table[k].what && t > 0)
645                         continue;
646
647                 mkdir_p(where, 0755);
648
649 #ifdef HAVE_SELINUX
650                 if (arg_selinux_apifs_context &&
651                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
652                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
653                         if (!options)
654                                 return log_oom();
655
656                         o = options;
657                 } else
658 #endif
659                         o = mount_table[k].options;
660
661
662                 if (mount(mount_table[k].what,
663                           where,
664                           mount_table[k].type,
665                           mount_table[k].flags,
666                           o) < 0 &&
667                     mount_table[k].fatal) {
668
669                         log_error("mount(%s) failed: %m", where);
670
671                         if (r == 0)
672                                 r = -errno;
673                 }
674         }
675
676         return r;
677 }
678
679 static int mount_binds(const char *dest, char **l, bool ro) {
680         char **x, **y;
681
682         STRV_FOREACH_PAIR(x, y, l) {
683                 _cleanup_free_ char *where = NULL;
684                 struct stat source_st, dest_st;
685                 int r;
686
687                 if (stat(*x, &source_st) < 0) {
688                         log_error("Failed to stat %s: %m", *x);
689                         return -errno;
690                 }
691
692                 where = strappend(dest, *y);
693                 if (!where)
694                         return log_oom();
695
696                 r = stat(where, &dest_st);
697                 if (r == 0) {
698                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
699                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
700                                 return -EINVAL;
701                         }
702                 } else if (errno == ENOENT) {
703                         r = mkdir_parents_label(where, 0755);
704                         if (r < 0) {
705                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
706                                 return r;
707                         }
708                 } else {
709                         log_error("Failed to bind mount %s: %m", *x);
710                         return -errno;
711                 }
712
713                 /* Create the mount point, but be conservative -- refuse to create block
714                 * and char devices. */
715                 if (S_ISDIR(source_st.st_mode))
716                         mkdir_label(where, 0755);
717                 else if (S_ISFIFO(source_st.st_mode))
718                         mkfifo(where, 0644);
719                 else if (S_ISSOCK(source_st.st_mode))
720                         mknod(where, 0644 | S_IFSOCK, 0);
721                 else if (S_ISREG(source_st.st_mode))
722                         touch(where);
723                 else {
724                         log_error("Refusing to create mountpoint for file: %s", *x);
725                         return -ENOTSUP;
726                 }
727
728                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
729                         log_error("mount(%s) failed: %m", where);
730                         return -errno;
731                 }
732
733                 if (ro) {
734                         r = bind_remount_recursive(where, true);
735                         if (r < 0) {
736                                 log_error("Read-Only bind mount failed: %s", strerror(-r));
737                                 return r;
738                         }
739                 }
740         }
741
742         return 0;
743 }
744
745 static int mount_tmpfs(const char *dest) {
746         char **i, **o;
747
748         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
749                 _cleanup_free_ char *where = NULL;
750
751                 where = strappend(dest, *i);
752                 if (!where)
753                         return log_oom();
754
755                 mkdir_label(where, 0755);
756
757                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
758                         log_error("tmpfs mount to %s failed: %m", where);
759                         return -errno;
760                 }
761         }
762
763         return 0;
764 }
765
766 static int setup_timezone(const char *dest) {
767         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
768         char *z, *y;
769         int r;
770
771         assert(dest);
772
773         /* Fix the timezone, if possible */
774         r = readlink_malloc("/etc/localtime", &p);
775         if (r < 0) {
776                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
777                 return 0;
778         }
779
780         z = path_startswith(p, "../usr/share/zoneinfo/");
781         if (!z)
782                 z = path_startswith(p, "/usr/share/zoneinfo/");
783         if (!z) {
784                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
785                 return 0;
786         }
787
788         where = strappend(dest, "/etc/localtime");
789         if (!where)
790                 return log_oom();
791
792         r = readlink_malloc(where, &q);
793         if (r >= 0) {
794                 y = path_startswith(q, "../usr/share/zoneinfo/");
795                 if (!y)
796                         y = path_startswith(q, "/usr/share/zoneinfo/");
797
798
799                 /* Already pointing to the right place? Then do nothing .. */
800                 if (y && streq(y, z))
801                         return 0;
802         }
803
804         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
805         if (!check)
806                 return log_oom();
807
808         if (access(check, F_OK) < 0) {
809                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
810                 return 0;
811         }
812
813         what = strappend("../usr/share/zoneinfo/", z);
814         if (!what)
815                 return log_oom();
816
817         unlink(where);
818         if (symlink(what, where) < 0) {
819                 log_error("Failed to correct timezone of container: %m");
820                 return 0;
821         }
822
823         return 0;
824 }
825
826 static int setup_resolv_conf(const char *dest) {
827         char _cleanup_free_ *where = NULL;
828
829         assert(dest);
830
831         if (arg_private_network)
832                 return 0;
833
834         /* Fix resolv.conf, if possible */
835         where = strappend(dest, "/etc/resolv.conf");
836         if (!where)
837                 return log_oom();
838
839         /* We don't really care for the results of this really. If it
840          * fails, it fails, but meh... */
841         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
842
843         return 0;
844 }
845
846 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
847
848         snprintf(s, 37,
849                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
850                  SD_ID128_FORMAT_VAL(id));
851
852         return s;
853 }
854
855 static int setup_boot_id(const char *dest) {
856         _cleanup_free_ char *from = NULL, *to = NULL;
857         sd_id128_t rnd = {};
858         char as_uuid[37];
859         int r;
860
861         assert(dest);
862
863         if (arg_share_system)
864                 return 0;
865
866         /* Generate a new randomized boot ID, so that each boot-up of
867          * the container gets a new one */
868
869         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
870         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
871         if (!from || !to)
872                 return log_oom();
873
874         r = sd_id128_randomize(&rnd);
875         if (r < 0) {
876                 log_error("Failed to generate random boot id: %s", strerror(-r));
877                 return r;
878         }
879
880         id128_format_as_uuid(rnd, as_uuid);
881
882         r = write_string_file(from, as_uuid);
883         if (r < 0) {
884                 log_error("Failed to write boot id: %s", strerror(-r));
885                 return r;
886         }
887
888         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
889                 log_error("Failed to bind mount boot id: %m");
890                 r = -errno;
891         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
892                 log_warning("Failed to make boot id read-only: %m");
893
894         unlink(from);
895         return r;
896 }
897
898 static int copy_devnodes(const char *dest) {
899
900         static const char devnodes[] =
901                 "null\0"
902                 "zero\0"
903                 "full\0"
904                 "random\0"
905                 "urandom\0"
906                 "tty\0";
907
908         const char *d;
909         int r = 0;
910         _cleanup_umask_ mode_t u;
911
912         assert(dest);
913
914         u = umask(0000);
915
916         NULSTR_FOREACH(d, devnodes) {
917                 _cleanup_free_ char *from = NULL, *to = NULL;
918                 struct stat st;
919
920                 from = strappend("/dev/", d);
921                 to = strjoin(dest, "/dev/", d, NULL);
922                 if (!from || !to)
923                         return log_oom();
924
925                 if (stat(from, &st) < 0) {
926
927                         if (errno != ENOENT) {
928                                 log_error("Failed to stat %s: %m", from);
929                                 return -errno;
930                         }
931
932                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
933
934                         log_error("%s is not a char or block device, cannot copy", from);
935                         return -EIO;
936
937                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
938
939                         log_error("mknod(%s) failed: %m", dest);
940                         return  -errno;
941                 }
942         }
943
944         return r;
945 }
946
947 static int setup_ptmx(const char *dest) {
948         _cleanup_free_ char *p = NULL;
949
950         p = strappend(dest, "/dev/ptmx");
951         if (!p)
952                 return log_oom();
953
954         if (symlink("pts/ptmx", p) < 0) {
955                 log_error("Failed to create /dev/ptmx symlink: %m");
956                 return -errno;
957         }
958
959         return 0;
960 }
961
962 static int setup_dev_console(const char *dest, const char *console) {
963         _cleanup_umask_ mode_t u;
964         const char *to;
965         struct stat st;
966         int r;
967
968         assert(dest);
969         assert(console);
970
971         u = umask(0000);
972
973         if (stat("/dev/null", &st) < 0) {
974                 log_error("Failed to stat /dev/null: %m");
975                 return -errno;
976         }
977
978         r = chmod_and_chown(console, 0600, 0, 0);
979         if (r < 0) {
980                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
981                 return r;
982         }
983
984         /* We need to bind mount the right tty to /dev/console since
985          * ptys can only exist on pts file systems. To have something
986          * to bind mount things on we create a device node first, and
987          * use /dev/null for that since we the cgroups device policy
988          * allows us to create that freely, while we cannot create
989          * /dev/console. (Note that the major minor doesn't actually
990          * matter here, since we mount it over anyway). */
991
992         to = strappenda(dest, "/dev/console");
993         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
994                 log_error("mknod() for /dev/console failed: %m");
995                 return -errno;
996         }
997
998         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
999                 log_error("Bind mount for /dev/console failed: %m");
1000                 return -errno;
1001         }
1002
1003         return 0;
1004 }
1005
1006 static int setup_kmsg(const char *dest, int kmsg_socket) {
1007         _cleanup_free_ char *from = NULL, *to = NULL;
1008         int r, fd, k;
1009         _cleanup_umask_ mode_t u;
1010         union {
1011                 struct cmsghdr cmsghdr;
1012                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1013         } control = {};
1014         struct msghdr mh = {
1015                 .msg_control = &control,
1016                 .msg_controllen = sizeof(control),
1017         };
1018         struct cmsghdr *cmsg;
1019
1020         assert(dest);
1021         assert(kmsg_socket >= 0);
1022
1023         u = umask(0000);
1024
1025         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1026          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1027          * on the reading side behave very similar to /proc/kmsg,
1028          * their writing side behaves differently from /dev/kmsg in
1029          * that writing blocks when nothing is reading. In order to
1030          * avoid any problems with containers deadlocking due to this
1031          * we simply make /dev/kmsg unavailable to the container. */
1032         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1033             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1034                 return log_oom();
1035
1036         if (mkfifo(from, 0600) < 0) {
1037                 log_error("mkfifo() for /dev/kmsg failed: %m");
1038                 return -errno;
1039         }
1040
1041         r = chmod_and_chown(from, 0600, 0, 0);
1042         if (r < 0) {
1043                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1044                 return r;
1045         }
1046
1047         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1048                 log_error("Bind mount for /proc/kmsg failed: %m");
1049                 return -errno;
1050         }
1051
1052         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1053         if (fd < 0) {
1054                 log_error("Failed to open fifo: %m");
1055                 return -errno;
1056         }
1057
1058         cmsg = CMSG_FIRSTHDR(&mh);
1059         cmsg->cmsg_level = SOL_SOCKET;
1060         cmsg->cmsg_type = SCM_RIGHTS;
1061         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1062         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1063
1064         mh.msg_controllen = cmsg->cmsg_len;
1065
1066         /* Store away the fd in the socket, so that it stays open as
1067          * long as we run the child */
1068         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1069         safe_close(fd);
1070
1071         if (k < 0) {
1072                 log_error("Failed to send FIFO fd: %m");
1073                 return -errno;
1074         }
1075
1076         /* And now make the FIFO unavailable as /dev/kmsg... */
1077         unlink(from);
1078         return 0;
1079 }
1080
1081 static int setup_hostname(void) {
1082
1083         if (arg_share_system)
1084                 return 0;
1085
1086         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1087                 return -errno;
1088
1089         return 0;
1090 }
1091
1092 static int setup_journal(const char *directory) {
1093         sd_id128_t machine_id, this_id;
1094         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1095         char *id;
1096         int r;
1097
1098         p = strappend(directory, "/etc/machine-id");
1099         if (!p)
1100                 return log_oom();
1101
1102         r = read_one_line_file(p, &b);
1103         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1104                 return 0;
1105         else if (r < 0) {
1106                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1107                 return r;
1108         }
1109
1110         id = strstrip(b);
1111         if (isempty(id) && arg_link_journal == LINK_AUTO)
1112                 return 0;
1113
1114         /* Verify validity */
1115         r = sd_id128_from_string(id, &machine_id);
1116         if (r < 0) {
1117                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1118                 return r;
1119         }
1120
1121         r = sd_id128_get_machine(&this_id);
1122         if (r < 0) {
1123                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1124                 return r;
1125         }
1126
1127         if (sd_id128_equal(machine_id, this_id)) {
1128                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1129                          "Host and machine ids are equal (%s): refusing to link journals", id);
1130                 if (arg_link_journal == LINK_AUTO)
1131                         return 0;
1132                 return
1133                         -EEXIST;
1134         }
1135
1136         if (arg_link_journal == LINK_NO)
1137                 return 0;
1138
1139         free(p);
1140         p = strappend("/var/log/journal/", id);
1141         q = strjoin(directory, "/var/log/journal/", id, NULL);
1142         if (!p || !q)
1143                 return log_oom();
1144
1145         if (path_is_mount_point(p, false) > 0) {
1146                 if (arg_link_journal != LINK_AUTO) {
1147                         log_error("%s: already a mount point, refusing to use for journal", p);
1148                         return -EEXIST;
1149                 }
1150
1151                 return 0;
1152         }
1153
1154         if (path_is_mount_point(q, false) > 0) {
1155                 if (arg_link_journal != LINK_AUTO) {
1156                         log_error("%s: already a mount point, refusing to use for journal", q);
1157                         return -EEXIST;
1158                 }
1159
1160                 return 0;
1161         }
1162
1163         r = readlink_and_make_absolute(p, &d);
1164         if (r >= 0) {
1165                 if ((arg_link_journal == LINK_GUEST ||
1166                      arg_link_journal == LINK_AUTO) &&
1167                     path_equal(d, q)) {
1168
1169                         r = mkdir_p(q, 0755);
1170                         if (r < 0)
1171                                 log_warning("failed to create directory %s: %m", q);
1172                         return 0;
1173                 }
1174
1175                 if (unlink(p) < 0) {
1176                         log_error("Failed to remove symlink %s: %m", p);
1177                         return -errno;
1178                 }
1179         } else if (r == -EINVAL) {
1180
1181                 if (arg_link_journal == LINK_GUEST &&
1182                     rmdir(p) < 0) {
1183
1184                         if (errno == ENOTDIR) {
1185                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1186                                 return r;
1187                         } else {
1188                                 log_error("Failed to remove %s: %m", p);
1189                                 return -errno;
1190                         }
1191                 }
1192         } else if (r != -ENOENT) {
1193                 log_error("readlink(%s) failed: %m", p);
1194                 return r;
1195         }
1196
1197         if (arg_link_journal == LINK_GUEST) {
1198
1199                 if (symlink(q, p) < 0) {
1200                         log_error("Failed to symlink %s to %s: %m", q, p);
1201                         return -errno;
1202                 }
1203
1204                 r = mkdir_p(q, 0755);
1205                 if (r < 0)
1206                         log_warning("failed to create directory %s: %m", q);
1207                 return 0;
1208         }
1209
1210         if (arg_link_journal == LINK_HOST) {
1211                 r = mkdir_p(p, 0755);
1212                 if (r < 0) {
1213                         log_error("Failed to create %s: %m", p);
1214                         return r;
1215                 }
1216
1217         } else if (access(p, F_OK) < 0)
1218                 return 0;
1219
1220         if (dir_is_empty(q) == 0)
1221                 log_warning("%s is not empty, proceeding anyway.", q);
1222
1223         r = mkdir_p(q, 0755);
1224         if (r < 0) {
1225                 log_error("Failed to create %s: %m", q);
1226                 return r;
1227         }
1228
1229         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1230                 log_error("Failed to bind mount journal from host into guest: %m");
1231                 return -errno;
1232         }
1233
1234         return 0;
1235 }
1236
1237 static int setup_kdbus(const char *dest, const char *path) {
1238         const char *p;
1239
1240         if (!path)
1241                 return 0;
1242
1243         p = strappenda(dest, "/dev/kdbus");
1244         if (mkdir(p, 0755) < 0) {
1245                 log_error("Failed to create kdbus path: %m");
1246                 return  -errno;
1247         }
1248
1249         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1250                 log_error("Failed to mount kdbus domain path: %m");
1251                 return -errno;
1252         }
1253
1254         return 0;
1255 }
1256
1257 static int drop_capabilities(void) {
1258         return capability_bounding_set_drop(~arg_retain, false);
1259 }
1260
1261 static int register_machine(pid_t pid) {
1262         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1263         _cleanup_bus_unref_ sd_bus *bus = NULL;
1264         int r;
1265
1266         if (!arg_register)
1267                 return 0;
1268
1269         r = sd_bus_default_system(&bus);
1270         if (r < 0) {
1271                 log_error("Failed to open system bus: %s", strerror(-r));
1272                 return r;
1273         }
1274
1275         if (arg_keep_unit) {
1276                 r = sd_bus_call_method(
1277                                 bus,
1278                                 "org.freedesktop.machine1",
1279                                 "/org/freedesktop/machine1",
1280                                 "org.freedesktop.machine1.Manager",
1281                                 "RegisterMachine",
1282                                 &error,
1283                                 NULL,
1284                                 "sayssus",
1285                                 arg_machine,
1286                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1287                                 "nspawn",
1288                                 "container",
1289                                 (uint32_t) pid,
1290                                 strempty(arg_directory));
1291         } else {
1292                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1293
1294                 r = sd_bus_message_new_method_call(
1295                                 bus,
1296                                 &m,
1297                                 "org.freedesktop.machine1",
1298                                 "/org/freedesktop/machine1",
1299                                 "org.freedesktop.machine1.Manager",
1300                                 "CreateMachine");
1301                 if (r < 0) {
1302                         log_error("Failed to create message: %s", strerror(-r));
1303                         return r;
1304                 }
1305
1306                 r = sd_bus_message_append(
1307                                 m,
1308                                 "sayssus",
1309                                 arg_machine,
1310                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1311                                 "nspawn",
1312                                 "container",
1313                                 (uint32_t) pid,
1314                                 strempty(arg_directory));
1315                 if (r < 0) {
1316                         log_error("Failed to append message arguments: %s", strerror(-r));
1317                         return r;
1318                 }
1319
1320                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1321                 if (r < 0) {
1322                         log_error("Failed to open container: %s", strerror(-r));
1323                         return r;
1324                 }
1325
1326                 if (!isempty(arg_slice)) {
1327                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1328                         if (r < 0) {
1329                                 log_error("Failed to append slice: %s", strerror(-r));
1330                                 return r;
1331                         }
1332                 }
1333
1334                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1335                 if (r < 0) {
1336                         log_error("Failed to add device policy: %s", strerror(-r));
1337                         return r;
1338                 }
1339
1340                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1341                                           /* Allow the container to
1342                                            * access and create the API
1343                                            * device nodes, so that
1344                                            * PrivateDevices= in the
1345                                            * container can work
1346                                            * fine */
1347                                           "/dev/null", "rwm",
1348                                           "/dev/zero", "rwm",
1349                                           "/dev/full", "rwm",
1350                                           "/dev/random", "rwm",
1351                                           "/dev/urandom", "rwm",
1352                                           "/dev/tty", "rwm",
1353                                           /* Allow the container
1354                                            * access to ptys. However,
1355                                            * do not permit the
1356                                            * container to ever create
1357                                            * these device nodes. */
1358                                           "/dev/pts/ptmx", "rw",
1359                                           "char-pts", "rw",
1360                                           /* Allow the container
1361                                            * access to all kdbus
1362                                            * devices. Again, the
1363                                            * container cannot create
1364                                            * these nodes, only use
1365                                            * them. We use a pretty
1366                                            * open match here, so that
1367                                            * the kernel API can still
1368                                            * change. */
1369                                           "char-kdbus", "rw",
1370                                           "char-kdbus/*", "rw");
1371                 if (r < 0) {
1372                         log_error("Failed to add device whitelist: %s", strerror(-r));
1373                         return r;
1374                 }
1375
1376                 r = sd_bus_message_close_container(m);
1377                 if (r < 0) {
1378                         log_error("Failed to close container: %s", strerror(-r));
1379                         return r;
1380                 }
1381
1382                 r = sd_bus_call(bus, m, 0, &error, NULL);
1383         }
1384
1385         if (r < 0) {
1386                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1387                 return r;
1388         }
1389
1390         return 0;
1391 }
1392
1393 static int terminate_machine(pid_t pid) {
1394         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1395         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1396         _cleanup_bus_unref_ sd_bus *bus = NULL;
1397         const char *path;
1398         int r;
1399
1400         if (!arg_register)
1401                 return 0;
1402
1403         r = sd_bus_default_system(&bus);
1404         if (r < 0) {
1405                 log_error("Failed to open system bus: %s", strerror(-r));
1406                 return r;
1407         }
1408
1409         r = sd_bus_call_method(
1410                         bus,
1411                         "org.freedesktop.machine1",
1412                         "/org/freedesktop/machine1",
1413                         "org.freedesktop.machine1.Manager",
1414                         "GetMachineByPID",
1415                         &error,
1416                         &reply,
1417                         "u",
1418                         (uint32_t) pid);
1419         if (r < 0) {
1420                 /* Note that the machine might already have been
1421                  * cleaned up automatically, hence don't consider it a
1422                  * failure if we cannot get the machine object. */
1423                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1424                 return 0;
1425         }
1426
1427         r = sd_bus_message_read(reply, "o", &path);
1428         if (r < 0)
1429                 return bus_log_parse_error(r);
1430
1431         r = sd_bus_call_method(
1432                         bus,
1433                         "org.freedesktop.machine1",
1434                         path,
1435                         "org.freedesktop.machine1.Machine",
1436                         "Terminate",
1437                         &error,
1438                         NULL,
1439                         NULL);
1440         if (r < 0) {
1441                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1442                 return 0;
1443         }
1444
1445         return 0;
1446 }
1447
1448 static int reset_audit_loginuid(void) {
1449         _cleanup_free_ char *p = NULL;
1450         int r;
1451
1452         if (arg_share_system)
1453                 return 0;
1454
1455         r = read_one_line_file("/proc/self/loginuid", &p);
1456         if (r == -ENOENT)
1457                 return 0;
1458         if (r < 0) {
1459                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1460                 return r;
1461         }
1462
1463         /* Already reset? */
1464         if (streq(p, "4294967295"))
1465                 return 0;
1466
1467         r = write_string_file("/proc/self/loginuid", "4294967295");
1468         if (r < 0) {
1469                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1470                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1471                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1472                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1473                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1474
1475                 sleep(5);
1476         }
1477
1478         return 0;
1479 }
1480
1481 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1482
1483 static int get_mac(struct ether_addr *mac) {
1484         int r;
1485
1486         uint8_t result[8];
1487         size_t l, sz;
1488         uint8_t *v;
1489
1490         l = strlen(arg_machine);
1491         sz = sizeof(sd_id128_t) + l;
1492         v = alloca(sz);
1493
1494         /* fetch some persistent data unique to the host */
1495         r = sd_id128_get_machine((sd_id128_t*) v);
1496         if (r < 0)
1497                 return r;
1498
1499         /* combine with some data unique (on this host) to this
1500          * container instance */
1501         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1502
1503         /* Let's hash the host machine ID plus the container name. We
1504          * use a fixed, but originally randomly created hash key here. */
1505         siphash24(result, v, sz, HASH_KEY.bytes);
1506
1507         assert_cc(ETH_ALEN <= sizeof(result));
1508         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1509
1510         /* see eth_random_addr in the kernel */
1511         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1512         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1513
1514         return 0;
1515 }
1516
1517 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1518         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1519         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1520         struct ether_addr mac;
1521         int r;
1522
1523         if (!arg_private_network)
1524                 return 0;
1525
1526         if (!arg_network_veth)
1527                 return 0;
1528
1529         /* Use two different interface name prefixes depending whether
1530          * we are in bridge mode or not. */
1531         if (arg_network_bridge)
1532                 memcpy(iface_name, "vb-", 3);
1533         else
1534                 memcpy(iface_name, "ve-", 3);
1535         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1536
1537         r = get_mac(&mac);
1538         if (r < 0) {
1539                 log_error("Failed to generate predictable MAC address for host0");
1540                 return r;
1541         }
1542
1543         r = sd_rtnl_open(&rtnl, 0);
1544         if (r < 0) {
1545                 log_error("Failed to connect to netlink: %s", strerror(-r));
1546                 return r;
1547         }
1548
1549         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1550         if (r < 0) {
1551                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1552                 return r;
1553         }
1554
1555         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1556         if (r < 0) {
1557                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1558                 return r;
1559         }
1560
1561         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1562         if (r < 0) {
1563                 log_error("Failed to open netlink container: %s", strerror(-r));
1564                 return r;
1565         }
1566
1567         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1568         if (r < 0) {
1569                 log_error("Failed to open netlink container: %s", strerror(-r));
1570                 return r;
1571         }
1572
1573         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1574         if (r < 0) {
1575                 log_error("Failed to open netlink container: %s", strerror(-r));
1576                 return r;
1577         }
1578
1579         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1580         if (r < 0) {
1581                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1582                 return r;
1583         }
1584
1585         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1586         if (r < 0) {
1587                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1588                 return r;
1589         }
1590
1591         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1592         if (r < 0) {
1593                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1594                 return r;
1595         }
1596
1597         r = sd_rtnl_message_close_container(m);
1598         if (r < 0) {
1599                 log_error("Failed to close netlink container: %s", strerror(-r));
1600                 return r;
1601         }
1602
1603         r = sd_rtnl_message_close_container(m);
1604         if (r < 0) {
1605                 log_error("Failed to close netlink container: %s", strerror(-r));
1606                 return r;
1607         }
1608
1609         r = sd_rtnl_message_close_container(m);
1610         if (r < 0) {
1611                 log_error("Failed to close netlink container: %s", strerror(-r));
1612                 return r;
1613         }
1614
1615         r = sd_rtnl_call(rtnl, m, 0, NULL);
1616         if (r < 0) {
1617                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1618                 return r;
1619         }
1620
1621         return 0;
1622 }
1623
1624 static int setup_bridge(const char veth_name[]) {
1625         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1626         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1627         int r, bridge;
1628
1629         if (!arg_private_network)
1630                 return 0;
1631
1632         if (!arg_network_veth)
1633                 return 0;
1634
1635         if (!arg_network_bridge)
1636                 return 0;
1637
1638         bridge = (int) if_nametoindex(arg_network_bridge);
1639         if (bridge <= 0) {
1640                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1641                 return -errno;
1642         }
1643
1644         r = sd_rtnl_open(&rtnl, 0);
1645         if (r < 0) {
1646                 log_error("Failed to connect to netlink: %s", strerror(-r));
1647                 return r;
1648         }
1649
1650         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1651         if (r < 0) {
1652                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1653                 return r;
1654         }
1655
1656         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1657         if (r < 0) {
1658                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1659                 return r;
1660         }
1661
1662         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1663         if (r < 0) {
1664                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1665                 return r;
1666         }
1667
1668         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1669         if (r < 0) {
1670                 log_error("Failed to add netlink master field: %s", strerror(-r));
1671                 return r;
1672         }
1673
1674         r = sd_rtnl_call(rtnl, m, 0, NULL);
1675         if (r < 0) {
1676                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1677                 return r;
1678         }
1679
1680         return 0;
1681 }
1682
1683 static int parse_interface(struct udev *udev, const char *name) {
1684         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1685         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1686         int ifi;
1687
1688         ifi = (int) if_nametoindex(name);
1689         if (ifi <= 0) {
1690                 log_error("Failed to resolve interface %s: %m", name);
1691                 return -errno;
1692         }
1693
1694         sprintf(ifi_str, "n%i", ifi);
1695         d = udev_device_new_from_device_id(udev, ifi_str);
1696         if (!d) {
1697                 log_error("Failed to get udev device for interface %s: %m", name);
1698                 return -errno;
1699         }
1700
1701         if (udev_device_get_is_initialized(d) <= 0) {
1702                 log_error("Network interface %s is not initialized yet.", name);
1703                 return -EBUSY;
1704         }
1705
1706         return ifi;
1707 }
1708
1709 static int move_network_interfaces(pid_t pid) {
1710         _cleanup_udev_unref_ struct udev *udev = NULL;
1711         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1712         char **i;
1713         int r;
1714
1715         if (!arg_private_network)
1716                 return 0;
1717
1718         if (strv_isempty(arg_network_interfaces))
1719                 return 0;
1720
1721         r = sd_rtnl_open(&rtnl, 0);
1722         if (r < 0) {
1723                 log_error("Failed to connect to netlink: %s", strerror(-r));
1724                 return r;
1725         }
1726
1727         udev = udev_new();
1728         if (!udev) {
1729                 log_error("Failed to connect to udev.");
1730                 return -ENOMEM;
1731         }
1732
1733         STRV_FOREACH(i, arg_network_interfaces) {
1734                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1735                 int ifi;
1736
1737                 ifi = parse_interface(udev, *i);
1738                 if (ifi < 0)
1739                         return ifi;
1740
1741                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1742                 if (r < 0) {
1743                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1744                         return r;
1745                 }
1746
1747                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1748                 if (r < 0) {
1749                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1750                         return r;
1751                 }
1752
1753                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1754                 if (r < 0) {
1755                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1756                         return r;
1757                 }
1758         }
1759
1760         return 0;
1761 }
1762
1763 static int setup_macvlan(pid_t pid) {
1764         _cleanup_udev_unref_ struct udev *udev = NULL;
1765         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1766         char **i;
1767         int r;
1768
1769         if (!arg_private_network)
1770                 return 0;
1771
1772         if (strv_isempty(arg_network_macvlan))
1773                 return 0;
1774
1775         r = sd_rtnl_open(&rtnl, 0);
1776         if (r < 0) {
1777                 log_error("Failed to connect to netlink: %s", strerror(-r));
1778                 return r;
1779         }
1780
1781         udev = udev_new();
1782         if (!udev) {
1783                 log_error("Failed to connect to udev.");
1784                 return -ENOMEM;
1785         }
1786
1787         STRV_FOREACH(i, arg_network_macvlan) {
1788                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1789                 _cleanup_free_ char *n = NULL;
1790                 int ifi;
1791
1792                 ifi = parse_interface(udev, *i);
1793                 if (ifi < 0)
1794                         return ifi;
1795
1796                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1797                 if (r < 0) {
1798                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1799                         return r;
1800                 }
1801
1802                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1803                 if (r < 0) {
1804                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1805                         return r;
1806                 }
1807
1808                 n = strappend("mv-", *i);
1809                 if (!n)
1810                         return log_oom();
1811
1812                 strshorten(n, IFNAMSIZ-1);
1813
1814                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1815                 if (r < 0) {
1816                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1817                         return r;
1818                 }
1819
1820                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1821                 if (r < 0) {
1822                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1823                         return r;
1824                 }
1825
1826                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1827                 if (r < 0) {
1828                         log_error("Failed to open netlink container: %s", strerror(-r));
1829                         return r;
1830                 }
1831
1832                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1833                 if (r < 0) {
1834                         log_error("Failed to open netlink container: %s", strerror(-r));
1835                         return r;
1836                 }
1837
1838                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1839                 if (r < 0) {
1840                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1841                         return r;
1842                 }
1843
1844                 r = sd_rtnl_message_close_container(m);
1845                 if (r < 0) {
1846                         log_error("Failed to close netlink container: %s", strerror(-r));
1847                         return r;
1848                 }
1849
1850                 r = sd_rtnl_message_close_container(m);
1851                 if (r < 0) {
1852                         log_error("Failed to close netlink container: %s", strerror(-r));
1853                         return r;
1854                 }
1855
1856                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1857                 if (r < 0) {
1858                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1859                         return r;
1860                 }
1861         }
1862
1863         return 0;
1864 }
1865
/* Installs a seccomp filter that allows everything except
 * socket(AF_NETLINK, *, NETLINK_AUDIT), which is made to fail with
 * EAFNOSUPPORT. Without HAVE_SECCOMP this is a no-op returning 0.
 *
 * Returns the result of the last seccomp call made: negative on failure
 * (after logging), otherwise the return value of seccomp_load(). */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx filter;
        int k;

        /*
           Audit does not work inside containers, and a good part of
           the audit userspace breaks when it is run in one. Rather
           than coping with that, simply prevent audit sockets from
           being created at all.

           As a result socket(AF_NETLINK, *, NETLINK_AUDIT) fails with
           EAFNOSUPPORT, which audit userspace interprets as "audit is
           disabled in the kernel".
         */

        /* Default action: allow every system call */
        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        k = seccomp_add_secondary_archs(filter);
        if (k < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-k));
                goto out;
        }

        /* Match on argument 0 (domain) and argument 2 (protocol) of socket() */
        k = seccomp_rule_add(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (k < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-k));
                goto out;
        }

        /* Turn off the filter's NNP control attribute before loading it */
        k = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (k < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-k));
                goto out;
        }

        k = seccomp_load(filter);
        if (k < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-k));

out:
        /* The filter context is released on success and failure alike */
        seccomp_release(filter);
        return k;
#else
        return 0;
#endif

}
1922
1923 static int setup_image(char **device_path, int *loop_nr) {
1924         struct loop_info64 info = {
1925                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1926         };
1927         _cleanup_close_ int fd = -1, control = -1, loop = -1;
1928         _cleanup_free_ char* loopdev = NULL;
1929         struct stat st;
1930         int r, nr;
1931
1932         assert(device_path);
1933         assert(loop_nr);
1934
1935         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1936         if (fd < 0) {
1937                 log_error("Failed to open %s: %m", arg_image);
1938                 return -errno;
1939         }
1940
1941         if (fstat(fd, &st) < 0) {
1942                 log_error("Failed to stat %s: %m", arg_image);
1943                 return -errno;
1944         }
1945
1946         if (S_ISBLK(st.st_mode)) {
1947                 char *p;
1948
1949                 p = strdup(arg_image);
1950                 if (!p)
1951                         return log_oom();
1952
1953                 *device_path = p;
1954
1955                 *loop_nr = -1;
1956
1957                 r = fd;
1958                 fd = -1;
1959
1960                 return r;
1961         }
1962
1963         if (!S_ISREG(st.st_mode)) {
1964                 log_error("%s is not a regular file or block device: %m", arg_image);
1965                 return -EINVAL;
1966         }
1967
1968         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1969         if (control < 0) {
1970                 log_error("Failed to open /dev/loop-control: %m");
1971                 return -errno;
1972         }
1973
1974         nr = ioctl(control, LOOP_CTL_GET_FREE);
1975         if (nr < 0) {
1976                 log_error("Failed to allocate loop device: %m");
1977                 return -errno;
1978         }
1979
1980         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1981                 return log_oom();
1982
1983         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1984         if (loop < 0) {
1985                 log_error("Failed to open loop device %s: %m", loopdev);
1986                 return -errno;
1987         }
1988
1989         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1990                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1991                 return -errno;
1992         }
1993
1994         if (arg_read_only)
1995                 info.lo_flags |= LO_FLAGS_READ_ONLY;
1996
1997         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1998                 log_error("Failed to set loopback settings on %s: %m", loopdev);
1999                 return -errno;
2000         }
2001
2002         *device_path = loopdev;
2003         loopdev = NULL;
2004
2005         *loop_nr = nr;
2006
2007         r = loop;
2008         loop = -1;
2009
2010         return r;
2011 }
2012
#ifdef HAVE_BLKID
/* Helper for dissect_image(): considers the partition with device node `node`
 * and partition number `nr` as candidate for one partition role. The candidate
 * replaces the current choice in *device unless a choice with a lower or equal
 * partition number is already recorded. `rw` is stored alongside it.
 *
 * Returns 0 (whether or not the candidate was taken) or a negative value if
 * memory allocation failed, in which case the current choice is unchanged. */
static int dissect_image_consider(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, bool rw) {

        char *copy;

        if (*device && nr >= *device_nr)
                return 0;

        copy = strdup(node);
        if (!copy)
                return log_oom();

        free(*device);
        *device = copy;
        *device_nr = nr;
        *device_rw = rw;

        return 0;
}
#endif

/* Inspects the GPT partition table of the block device open as `fd` and picks
 * the device nodes of the root, /home and /srv partitions by partition type.
 *
 * Partitions flagged GPT_FLAG_NO_AUTO are ignored. If several partitions of
 * the same type exist, the one with the lowest partition number is used. The
 * native root partition type is preferred; the secondary root type is used
 * only if no native one exists, and *secondary reports which was chosen.
 *
 * On success returns 0 and stores newly allocated device node paths (to be
 * freed by the caller) in *root_device and, only if such partitions exist, in
 * *home_device and *srv_device; the matching *_rw output is false when the
 * partition carries GPT_FLAG_READ_ONLY. Returns -EINVAL if no partition
 * table, no GPT or no root partition is found, -ENOTSUP when compiled without
 * blkid support, or another negative errno-style value. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared first so that a failure which leaves it untouched
         * can be told apart; that case is reported as out-of-memory. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected */
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        /* pttype stays NULL if the lookup fails; streq_ptr() copes with that */
        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate the udev devices that have the whole-disk device as parent */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Initialized so the cleanup handler never sees an indeterminate pointer */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                bool rw;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number ... */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* ... the whole-disk device itself ... */
                if (st.st_rdev == qn)
                        continue;

                /* ... and entries without a device node. */
                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                /* Map the udev device back to its partition table entry */
                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                rw = !(flags & GPT_FLAG_READ_ONLY);

                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_image_consider(&home, &home_nr, &home_rw, node, nr, rw);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_image_consider(&srv, &srv_nr, &srv_rw, node, nr, rw);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_image_consider(&root, &root_nr, &root_rw, node, nr, rw);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_image_consider(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, rw);
#endif
                else
                        continue;

                if (r < 0)
                        return r;
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Pass ownership of the strings to the caller; resetting the locals
         * to NULL keeps the cleanup handlers from freeing them. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2262
/* Mounts the block device `what` at `where`, or at `where` with `directory`
 * appended if `directory` is non-NULL. The file system type is detected with
 * libblkid. The mount is read-only unless `rw` is true and arg_read_only is
 * unset, and always uses MS_NODEV.
 *
 * Returns 0 on success, -EINVAL if the file system type cannot be determined,
 * -ENOTSUP for LUKS volumes or when compiled without blkid support, or
 * another negative errno-style value. All failures are logged. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* errno is cleared first so that a failure which leaves it untouched
         * can be told apart; that case is reported as out-of-memory. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns -2 for an ambivalent result and
                 * 1 if nothing was detected; -1 is a genuine probing error
                 * and is handled by the branch below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2328
/* Mounts the partitions found in a disk image below `where`: the root
 * device at `where` itself, the home device at `where`/home and the srv
 * device at `where`/srv. Any of the device arguments may be NULL, in which
 * case that mount is skipped. Each *_rw flag is passed on to mount_device().
 *
 * Returns 0 on success or the negative errno-style value of the first mount
 * that failed (after logging). */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the `where` argument rather than the global
         * arg_directory, so the function honours its own parameter. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2364
2365 static void loop_remove(int nr, int *image_fd) {
2366         _cleanup_close_ int control = -1;
2367
2368         if (nr < 0)
2369                 return;
2370
2371         if (image_fd && *image_fd >= 0) {
2372                 ioctl(*image_fd, LOOP_CLR_FD);
2373                 *image_fd = safe_close(*image_fd);
2374         }
2375
2376         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2377         if (control < 0)
2378                 return;
2379
2380         ioctl(control, LOOP_CTL_REMOVE, nr);
2381 }
2382
2383 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2384         int pipe_fds[2];
2385         pid_t pid;
2386
2387         assert(database);
2388         assert(key);
2389         assert(rpid);
2390
2391         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2392                 log_error("Failed to allocate pipe: %m");
2393                 return -errno;
2394         }
2395
2396         pid = fork();
2397         if (pid < 0) {
2398                 log_error("Failed to fork getent child: %m");
2399                 return -errno;
2400         } else if (pid == 0) {
2401                 int nullfd;
2402                 char *empty_env = NULL;
2403
2404                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2405                         _exit(EXIT_FAILURE);
2406
2407                 if (pipe_fds[0] > 2)
2408                         safe_close(pipe_fds[0]);
2409                 if (pipe_fds[1] > 2)
2410                         safe_close(pipe_fds[1]);
2411
2412                 nullfd = open("/dev/null", O_RDWR);
2413                 if (nullfd < 0)
2414                         _exit(EXIT_FAILURE);
2415
2416                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2417                         _exit(EXIT_FAILURE);
2418
2419                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2420                         _exit(EXIT_FAILURE);
2421
2422                 if (nullfd > 2)
2423                         safe_close(nullfd);
2424
2425                 reset_all_signal_handlers();
2426                 close_all_fds(NULL, 0);
2427
2428                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2429                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2430                 _exit(EXIT_FAILURE);
2431         }
2432
2433         pipe_fds[1] = safe_close(pipe_fds[1]);
2434
2435         *rpid = pid;
2436
2437         return pipe_fds[0];
2438 }
2439
2440 static int change_uid_gid(char **_home) {
2441         char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2442         _cleanup_free_ uid_t *uids = NULL;
2443         _cleanup_free_ char *home = NULL;
2444         _cleanup_fclose_ FILE *f = NULL;
2445         _cleanup_close_ int fd = -1;
2446         unsigned n_uids = 0;
2447         size_t sz = 0, l;
2448         uid_t uid;
2449         gid_t gid;
2450         pid_t pid;
2451         int r;
2452
2453         assert(_home);
2454
2455         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2456                 /* Reset everything fully to 0, just in case */
2457
2458                 if (setgroups(0, NULL) < 0) {
2459                         log_error("setgroups() failed: %m");
2460                         return -errno;
2461                 }
2462
2463                 if (setresgid(0, 0, 0) < 0) {
2464                         log_error("setregid() failed: %m");
2465                         return -errno;
2466                 }
2467
2468                 if (setresuid(0, 0, 0) < 0) {
2469                         log_error("setreuid() failed: %m");
2470                         return -errno;
2471                 }
2472
2473                 *_home = NULL;
2474                 return 0;
2475         }
2476
2477         /* First, get user credentials */
2478         fd = spawn_getent("passwd", arg_user, &pid);
2479         if (fd < 0)
2480                 return fd;
2481
2482         f = fdopen(fd, "r");
2483         if (!f)
2484                 return log_oom();
2485         fd = -1;
2486
2487         if (!fgets(line, sizeof(line), f)) {
2488
2489                 if (!ferror(f)) {
2490                         log_error("Failed to resolve user %s.", arg_user);
2491                         return -ESRCH;
2492                 }
2493
2494                 log_error("Failed to read from getent: %m");
2495                 return -errno;
2496         }
2497
2498         truncate_nl(line);
2499
2500         wait_for_terminate_and_warn("getent passwd", pid);
2501
2502         x = strchr(line, ':');
2503         if (!x) {
2504                 log_error("/etc/passwd entry has invalid user field.");
2505                 return -EIO;
2506         }
2507
2508         u = strchr(x+1, ':');
2509         if (!u) {
2510                 log_error("/etc/passwd entry has invalid password field.");
2511                 return -EIO;
2512         }
2513
2514         u++;
2515         g = strchr(u, ':');
2516         if (!g) {
2517                 log_error("/etc/passwd entry has invalid UID field.");
2518                 return -EIO;
2519         }
2520
2521         *g = 0;
2522         g++;
2523         x = strchr(g, ':');
2524         if (!x) {
2525                 log_error("/etc/passwd entry has invalid GID field.");
2526                 return -EIO;
2527         }
2528
2529         *x = 0;
2530         h = strchr(x+1, ':');
2531         if (!h) {
2532                 log_error("/etc/passwd entry has invalid GECOS field.");
2533                 return -EIO;
2534         }
2535
2536         h++;
2537         x = strchr(h, ':');
2538         if (!x) {
2539                 log_error("/etc/passwd entry has invalid home directory field.");
2540                 return -EIO;
2541         }
2542
2543         *x = 0;
2544
2545         r = parse_uid(u, &uid);
2546         if (r < 0) {
2547                 log_error("Failed to parse UID of user.");
2548                 return -EIO;
2549         }
2550
2551         r = parse_gid(g, &gid);
2552         if (r < 0) {
2553                 log_error("Failed to parse GID of user.");
2554                 return -EIO;
2555         }
2556
2557         home = strdup(h);
2558         if (!home)
2559                 return log_oom();
2560
2561         /* Second, get group memberships */
2562         fd = spawn_getent("initgroups", arg_user, &pid);
2563         if (fd < 0)
2564                 return fd;
2565
2566         fclose(f);
2567         f = fdopen(fd, "r");
2568         if (!f)
2569                 return log_oom();
2570         fd = -1;
2571
2572         if (!fgets(line, sizeof(line), f)) {
2573                 if (!ferror(f)) {
2574                         log_error("Failed to resolve user %s.", arg_user);
2575                         return -ESRCH;
2576                 }
2577
2578                 log_error("Failed to read from getent: %m");
2579                 return -errno;
2580         }
2581
2582         truncate_nl(line);
2583
2584         wait_for_terminate_and_warn("getent initgroups", pid);
2585
2586         /* Skip over the username and subsequent separator whitespace */
2587         x = line;
2588         x += strcspn(x, WHITESPACE);
2589         x += strspn(x, WHITESPACE);
2590
2591         FOREACH_WORD(w, l, x, state) {
2592                 char c[l+1];
2593
2594                 memcpy(c, w, l);
2595                 c[l] = 0;
2596
2597                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2598                         return log_oom();
2599
2600                 r = parse_uid(c, &uids[n_uids++]);
2601                 if (r < 0) {
2602                         log_error("Failed to parse group data from getent.");
2603                         return -EIO;
2604                 }
2605         }
2606
2607         r = mkdir_parents(home, 0775);
2608         if (r < 0) {
2609                 log_error("Failed to make home root directory: %s", strerror(-r));
2610                 return r;
2611         }
2612
2613         r = mkdir_safe(home, 0755, uid, gid);
2614         if (r < 0 && r != -EEXIST) {
2615                 log_error("Failed to make home directory: %s", strerror(-r));
2616                 return r;
2617         }
2618
2619         fchown(STDIN_FILENO, uid, gid);
2620         fchown(STDOUT_FILENO, uid, gid);
2621         fchown(STDERR_FILENO, uid, gid);
2622
2623         if (setgroups(n_uids, uids) < 0) {
2624                 log_error("Failed to set auxiliary groups: %m");
2625                 return -errno;
2626         }
2627
2628         if (setresgid(gid, gid, gid) < 0) {
2629                 log_error("setregid() failed: %m");
2630                 return -errno;
2631         }
2632
2633         if (setresuid(uid, uid, uid) < 0) {
2634                 log_error("setreuid() failed: %m");
2635                 return -errno;
2636         }
2637
2638         if (_home) {
2639                 *_home = home;
2640                 home = NULL;
2641         }
2642
2643         return 0;
2644 }
2645
2646 /*
2647  * Return 0 in case the container is being rebooted, has been shut
2648  * down or exited successfully. On failures a negative value is
2649  * returned.
2650  *
2651  * The status of the container "CONTAINER_TERMINATED" or
2652  * "CONTAINER_REBOOTED" will be saved in the container argument
2653  */
2654 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2655         int r;
2656         siginfo_t status;
2657
2658         r = wait_for_terminate(pid, &status);
2659         if (r < 0)
2660                 return r;
2661
2662         switch (status.si_code) {
2663         case CLD_EXITED:
2664                 r = status.si_status;
2665                 if (r == 0) {
2666                         if (!arg_quiet)
2667                                 log_debug("Container %s exited successfully.",
2668                                           arg_machine);
2669
2670                         *container = CONTAINER_TERMINATED;
2671                 } else {
2672                         log_error("Container %s failed with error code %i.",
2673                                   arg_machine, status.si_status);
2674                         r = -1;
2675                 }
2676                 break;
2677
2678         case CLD_KILLED:
2679                 if (status.si_status == SIGINT) {
2680                         if (!arg_quiet)
2681                                 log_info("Container %s has been shut down.",
2682                                          arg_machine);
2683
2684                         *container = CONTAINER_TERMINATED;
2685                         r = 0;
2686                         break;
2687                 } else if (status.si_status == SIGHUP) {
2688                         if (!arg_quiet)
2689                                 log_info("Container %s is being rebooted.",
2690                                          arg_machine);
2691
2692                         *container = CONTAINER_REBOOTED;
2693                         r = 0;
2694                         break;
2695                 }
2696                 /* CLD_KILLED fallthrough */
2697
2698         case CLD_DUMPED:
2699                 log_error("Container %s terminated by signal %s.",
2700                           arg_machine, signal_to_string(status.si_status));
2701                 r = -1;
2702                 break;
2703
2704         default:
2705                 log_error("Container %s failed due to unknown reason.",
2706                           arg_machine);
2707                 r = -1;
2708                 break;
2709         }
2710
2711         return r;
2712 }
2713
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts blocking calls instead of being
 * ignored. */
static void nop_handler(int sig) {
        /* Nothing to do */
}
2715
2716 int main(int argc, char *argv[]) {
2717
2718         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2719         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2720         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2721         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2722         _cleanup_fdset_free_ FDSet *fds = NULL;
2723         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2724         const char *console = NULL;
2725         char veth_name[IFNAMSIZ];
2726         bool secondary = false;
2727         sigset_t mask, mask_chld;
2728         pid_t pid = 0;
2729
2730         log_parse_environment();
2731         log_open();
2732
2733         k = parse_argv(argc, argv);
2734         if (k < 0)
2735                 goto finish;
2736         else if (k == 0) {
2737                 r = EXIT_SUCCESS;
2738                 goto finish;
2739         }
2740
2741         if (!arg_image) {
2742                 if (arg_directory) {
2743                         char *p;
2744
2745                         p = path_make_absolute_cwd(arg_directory);
2746                         free(arg_directory);
2747                         arg_directory = p;
2748                 } else
2749                         arg_directory = get_current_dir_name();
2750
2751                 if (!arg_directory) {
2752                         log_error("Failed to determine path, please use -D.");
2753                         goto finish;
2754                 }
2755                 path_kill_slashes(arg_directory);
2756         }
2757
2758         if (!arg_machine) {
2759                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2760                 if (!arg_machine) {
2761                         log_oom();
2762                         goto finish;
2763                 }
2764
2765                 hostname_cleanup(arg_machine, false);
2766                 if (isempty(arg_machine)) {
2767                         log_error("Failed to determine machine name automatically, please use -M.");
2768                         goto finish;
2769                 }
2770         }
2771
2772         if (geteuid() != 0) {
2773                 log_error("Need to be root.");
2774                 goto finish;
2775         }
2776
2777         if (sd_booted() <= 0) {
2778                 log_error("Not running on a systemd system.");
2779                 goto finish;
2780         }
2781
2782         log_close();
2783         n_fd_passed = sd_listen_fds(false);
2784         if (n_fd_passed > 0) {
2785                 k = fdset_new_listen_fds(&fds, false);
2786                 if (k < 0) {
2787                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2788                         goto finish;
2789                 }
2790         }
2791         fdset_close_others(fds);
2792         log_open();
2793
2794         if (arg_directory) {
2795                 if (path_equal(arg_directory, "/")) {
2796                         log_error("Spawning container on root directory not supported.");
2797                         goto finish;
2798                 }
2799
2800                 if (arg_boot) {
2801                         if (path_is_os_tree(arg_directory) <= 0) {
2802                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2803                                 goto finish;
2804                         }
2805                 } else {
2806                         const char *p;
2807
2808                         p = strappenda(arg_directory,
2809                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2810                         if (access(p, F_OK) < 0) {
2811                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2812                                 goto finish;
2813
2814                         }
2815                 }
2816         } else {
2817                 char template[] = "/tmp/nspawn-root-XXXXXX";
2818
2819                 if (!mkdtemp(template)) {
2820                         log_error("Failed to create temporary directory: %m");
2821                         r = -errno;
2822                         goto finish;
2823                 }
2824
2825                 arg_directory = strdup(template);
2826                 if (!arg_directory) {
2827                         r = log_oom();
2828                         goto finish;
2829                 }
2830
2831                 image_fd = setup_image(&device_path, &loop_nr);
2832                 if (image_fd < 0) {
2833                         r = image_fd;
2834                         goto finish;
2835                 }
2836
2837                 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2838                 if (r < 0)
2839                         goto finish;
2840         }
2841
2842         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2843         if (master < 0) {
2844                 log_error("Failed to acquire pseudo tty: %m");
2845                 goto finish;
2846         }
2847
2848         console = ptsname(master);
2849         if (!console) {
2850                 log_error("Failed to determine tty name: %m");
2851                 goto finish;
2852         }
2853
2854         if (!arg_quiet)
2855                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2856                          arg_machine, arg_image ? arg_image : arg_directory);
2857
2858         if (unlockpt(master) < 0) {
2859                 log_error("Failed to unlock tty: %m");
2860                 goto finish;
2861         }
2862
2863         if (access("/dev/kdbus/control", F_OK) >= 0) {
2864
2865                 if (arg_share_system) {
2866                         kdbus_domain = strdup("/dev/kdbus");
2867                         if (!kdbus_domain) {
2868                                 log_oom();
2869                                 goto finish;
2870                         }
2871                 } else {
2872                         const char *ns;
2873
2874                         ns = strappenda("machine-", arg_machine);
2875                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2876                         if (r < 0)
2877                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2878                         else
2879                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2880                 }
2881         }
2882
2883         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2884                 log_error("Failed to create kmsg socket pair: %m");
2885                 goto finish;
2886         }
2887
2888         sd_notify(0, "READY=1");
2889
2890         assert_se(sigemptyset(&mask) == 0);
2891         assert_se(sigemptyset(&mask_chld) == 0);
2892         sigaddset(&mask_chld, SIGCHLD);
2893         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2894         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2895
2896         for (;;) {
2897                 ContainerStatus container_status;
2898                 int eventfds[2] = { -1, -1 };
2899                 struct sigaction sa = {
2900                         .sa_handler = nop_handler,
2901                         .sa_flags = SA_NOCLDSTOP,
2902                 };
2903
2904                 /* Child can be killed before execv(), so handle SIGCHLD
2905                  * in order to interrupt parent's blocking calls and
2906                  * give it a chance to call wait() and terminate. */
2907                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2908                 if (r < 0) {
2909                         log_error("Failed to change the signal mask: %m");
2910                         goto finish;
2911                 }
2912
2913                 r = sigaction(SIGCHLD, &sa, NULL);
2914                 if (r < 0) {
2915                         log_error("Failed to install SIGCHLD handler: %m");
2916                         goto finish;
2917                 }
2918
2919                 pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
2920                                          (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2921                                          (arg_private_network ? CLONE_NEWNET : 0), eventfds);
2922                 if (pid < 0) {
2923                         if (errno == EINVAL)
2924                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2925                         else
2926                                 log_error("clone() failed: %m");
2927
2928                         r = pid;
2929                         goto finish;
2930                 }
2931
2932                 if (pid == 0) {
2933                         /* child */
2934                         _cleanup_free_ char *home = NULL;
2935                         unsigned n_env = 2;
2936                         const char *envp[] = {
2937                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
2938                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2939                                 NULL, /* TERM */
2940                                 NULL, /* HOME */
2941                                 NULL, /* USER */
2942                                 NULL, /* LOGNAME */
2943                                 NULL, /* container_uuid */
2944                                 NULL, /* LISTEN_FDS */
2945                                 NULL, /* LISTEN_PID */
2946                                 NULL
2947                         };
2948                         char **env_use;
2949
2950                         envp[n_env] = strv_find_prefix(environ, "TERM=");
2951                         if (envp[n_env])
2952                                 n_env ++;
2953
2954                         master = safe_close(master);
2955
2956                         close_nointr(STDIN_FILENO);
2957                         close_nointr(STDOUT_FILENO);
2958                         close_nointr(STDERR_FILENO);
2959
2960                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2961
2962                         reset_all_signal_handlers();
2963
2964                         assert_se(sigemptyset(&mask) == 0);
2965                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2966
2967                         k = open_terminal(console, O_RDWR);
2968                         if (k != STDIN_FILENO) {
2969                                 if (k >= 0) {
2970                                         safe_close(k);
2971                                         k = -EINVAL;
2972                                 }
2973
2974                                 log_error("Failed to open console: %s", strerror(-k));
2975                                 goto child_fail;
2976                         }
2977
2978                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2979                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2980                                 log_error("Failed to duplicate console: %m");
2981                                 goto child_fail;
2982                         }
2983
2984                         if (setsid() < 0) {
2985                                 log_error("setsid() failed: %m");
2986                                 goto child_fail;
2987                         }
2988
2989                         if (reset_audit_loginuid() < 0)
2990                                 goto child_fail;
2991
2992                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2993                                 log_error("PR_SET_PDEATHSIG failed: %m");
2994                                 goto child_fail;
2995                         }
2996
2997                         /* Mark everything as slave, so that we still
2998                          * receive mounts from the real root, but don't
2999                          * propagate mounts to the real root. */
3000                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3001                                 log_error("MS_SLAVE|MS_REC failed: %m");
3002                                 goto child_fail;
3003                         }
3004
3005                         if (mount_devices(arg_directory,
3006                                           root_device, root_device_rw,
3007                                           home_device, home_device_rw,
3008                                           srv_device, srv_device_rw) < 0)
3009                                 goto child_fail;
3010
3011                         /* Turn directory into bind mount */
3012                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3013                                 log_error("Failed to make bind mount: %m");
3014                                 goto child_fail;
3015                         }
3016
3017                         if (arg_read_only) {
3018                                 k = bind_remount_recursive(arg_directory, true);
3019                                 if (k < 0) {
3020                                         log_error("Failed to make tree read-only: %s", strerror(-k));
3021                                         goto child_fail;
3022                                 }
3023                         }
3024
3025                         if (mount_all(arg_directory) < 0)
3026                                 goto child_fail;
3027
3028                         if (copy_devnodes(arg_directory) < 0)
3029                                 goto child_fail;
3030
3031                         if (setup_ptmx(arg_directory) < 0)
3032                                 goto child_fail;
3033
3034                         dev_setup(arg_directory);
3035
3036                         if (audit_still_doesnt_work_in_containers() < 0)
3037                                 goto child_fail;
3038
3039                         if (setup_dev_console(arg_directory, console) < 0)
3040                                 goto child_fail;
3041
3042                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3043                                 goto child_fail;
3044
3045                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3046
3047                         if (setup_boot_id(arg_directory) < 0)
3048                                 goto child_fail;
3049
3050                         if (setup_timezone(arg_directory) < 0)
3051                                 goto child_fail;
3052
3053                         if (setup_resolv_conf(arg_directory) < 0)
3054                                 goto child_fail;
3055
3056                         if (setup_journal(arg_directory) < 0)
3057                                 goto child_fail;
3058
3059                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3060                                 goto child_fail;
3061
3062                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3063                                 goto child_fail;
3064
3065                         if (mount_tmpfs(arg_directory) < 0)
3066                                 goto child_fail;
3067
3068                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3069                                 goto child_fail;
3070
3071                         /* Tell the parent that we are ready, and that
3072                          * it can cgroupify us to that we lack access
3073                          * to certain devices and resources. */
3074                         r = eventfd_send_state(eventfds[1],
3075                                                EVENTFD_CHILD_SUCCEEDED);
3076                         eventfds[1] = safe_close(eventfds[1]);
3077                         if (r < 0)
3078                                 goto child_fail;
3079
3080                         if (chdir(arg_directory) < 0) {
3081                                 log_error("chdir(%s) failed: %m", arg_directory);
3082                                 goto child_fail;
3083                         }
3084
3085                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3086                                 log_error("mount(MS_MOVE) failed: %m");
3087                                 goto child_fail;
3088                         }
3089
3090                         if (chroot(".") < 0) {
3091                                 log_error("chroot() failed: %m");
3092                                 goto child_fail;
3093                         }
3094
3095                         if (chdir("/") < 0) {
3096                                 log_error("chdir() failed: %m");
3097                                 goto child_fail;
3098                         }
3099
3100                         umask(0022);
3101
3102                         if (arg_private_network)
3103                                 loopback_setup();
3104
3105                         if (drop_capabilities() < 0) {
3106                                 log_error("drop_capabilities() failed: %m");
3107                                 goto child_fail;
3108                         }
3109
3110                         r = change_uid_gid(&home);
3111                         if (r < 0)
3112                                 goto child_fail;
3113
3114                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3115                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3116                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3117                                 log_oom();
3118                                 goto child_fail;
3119                         }
3120
3121                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3122                                 char as_uuid[37];
3123
3124                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3125                                         log_oom();
3126                                         goto child_fail;
3127                                 }
3128                         }
3129
3130                         if (fdset_size(fds) > 0) {
3131                                 k = fdset_cloexec(fds, false);
3132                                 if (k < 0) {
3133                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3134                                         goto child_fail;
3135                                 }
3136
3137                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3138                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3139                                         log_oom();
3140                                         goto child_fail;
3141                                 }
3142                         }
3143
3144                         setup_hostname();
3145
3146                         if (arg_personality != 0xffffffffLU) {
3147                                 if (personality(arg_personality) < 0) {
3148                                         log_error("personality() failed: %m");
3149                                         goto child_fail;
3150                                 }
3151                         } else if (secondary) {
3152                                 if (personality(PER_LINUX32) < 0) {
3153                                         log_error("personality() failed: %m");
3154                                         goto child_fail;
3155                                 }
3156                         }
3157
3158 #ifdef HAVE_SELINUX
3159                         if (arg_selinux_context)
3160                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3161                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3162                                         goto child_fail;
3163                                 }
3164 #endif
3165
3166                         if (!strv_isempty(arg_setenv)) {
3167                                 char **n;
3168
3169                                 n = strv_env_merge(2, envp, arg_setenv);
3170                                 if (!n) {
3171                                         log_oom();
3172                                         goto child_fail;
3173                                 }
3174
3175                                 env_use = n;
3176                         } else
3177                                 env_use = (char**) envp;
3178
3179                         /* Wait until the parent is ready with the setup, too... */
3180                         r = eventfd_parent_succeeded(eventfds[0]);
3181                         eventfds[0] = safe_close(eventfds[0]);
3182                         if (r < 0)
3183                                 goto child_fail;
3184
3185                         if (arg_boot) {
3186                                 char **a;
3187                                 size_t l;
3188
3189                                 /* Automatically search for the init system */
3190
3191                                 l = 1 + argc - optind;
3192                                 a = newa(char*, l + 1);
3193                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3194
3195                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3196                                 execve(a[0], a, env_use);
3197
3198                                 a[0] = (char*) "/lib/systemd/systemd";
3199                                 execve(a[0], a, env_use);
3200
3201                                 a[0] = (char*) "/sbin/init";
3202                                 execve(a[0], a, env_use);
3203                         } else if (argc > optind)
3204                                 execvpe(argv[optind], argv + optind, env_use);
3205                         else {
3206                                 chdir(home ? home : "/root");
3207                                 execle("/bin/bash", "-bash", NULL, env_use);
3208                                 execle("/bin/sh", "-sh", NULL, env_use);
3209                         }
3210
3211                         log_error("execv() failed: %m");
3212
3213                 child_fail:
3214                         /* Tell the parent that the setup failed, so he
3215                          * can clean up resources and terminate. */
3216                         if (eventfds[1] != -1)
3217                                 eventfd_send_state(eventfds[1],
3218                                                    EVENTFD_CHILD_FAILED);
3219                         _exit(EXIT_FAILURE);
3220                 }
3221
3222                 fdset_free(fds);
3223                 fds = NULL;
3224
3225                 /* Wait for the child event:
3226                  * If EVENTFD_CHILD_FAILED, the child will terminate soon.
3227                  * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
3228                  * it is ready with all it needs to do with priviliges.
3229                  * After we got the notification we can make the process
3230                  * join its cgroup which might limit what it can do */
3231                 r = eventfd_child_succeeded(eventfds[1]);
3232                 eventfds[1] = safe_close(eventfds[1]);
3233                 if (r < 0)
3234                         goto check_container_status;
3235
3236                 r = register_machine(pid);
3237                 if (r < 0)
3238                         goto finish;
3239
3240                 r = move_network_interfaces(pid);
3241                 if (r < 0)
3242                         goto finish;
3243
3244                 r = setup_veth(pid, veth_name);
3245                 if (r < 0)
3246                         goto finish;
3247
3248                 r = setup_bridge(veth_name);
3249                 if (r < 0)
3250                         goto finish;
3251
3252                 r = setup_macvlan(pid);
3253                 if (r < 0)
3254                         goto finish;
3255
3256                 /* Block SIGCHLD here, before notifying child.
3257                  * process_pty() will handle it with the other signals. */
3258                 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3259                 if (r < 0)
3260                         goto finish;
3261
3262                 /* Reset signal to default */
3263                 r = default_signals(SIGCHLD, -1);
3264                 if (r < 0)
3265                         goto finish;
3266
3267                 /* Notify the child that the parent is ready with all
3268                  * its setup, and that the child can now hand over
3269                  * control to the code to run inside the container. */
3270                 r = eventfd_send_state(eventfds[0],
3271                                        EVENTFD_PARENT_SUCCEEDED);
3272                 eventfds[0] = safe_close(eventfds[0]);
3273                 if (r < 0)
3274                         goto finish;
3275
3276                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3277                 if (k < 0) {
3278                         r = EXIT_FAILURE;
3279                         break;
3280                 }
3281
3282                 if (!arg_quiet)
3283                         putc('\n', stdout);
3284
3285                 /* Kill if it is not dead yet anyway */
3286                 terminate_machine(pid);
3287
3288 check_container_status:
3289                 /* Redundant, but better safe than sorry */
3290                 kill(pid, SIGKILL);
3291
3292                 r = wait_for_container(pid, &container_status);
3293                 pid = 0;
3294
3295                 if (r < 0) {
3296                         r = EXIT_FAILURE;
3297                         break;
3298                 } else if (container_status == CONTAINER_TERMINATED)
3299                         break;
3300
3301                 /* CONTAINER_REBOOTED, loop again */
3302         }
3303
3304 finish:
3305         loop_remove(loop_nr, &image_fd);
3306
3307         if (pid > 0)
3308                 kill(pid, SIGKILL);
3309
3310         free(arg_directory);
3311         free(arg_machine);
3312         free(arg_user);
3313         strv_free(arg_setenv);
3314         strv_free(arg_network_interfaces);
3315         strv_free(arg_network_macvlan);
3316         strv_free(arg_bind);
3317         strv_free(arg_bind_ro);
3318         strv_free(arg_tmpfs);
3319
3320         return r;
3321 }