chiark / gitweb /
sd-bus: sync with kdbus upstream (ABI break)
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92 #include "event-util.h"
93
94 #ifdef HAVE_SECCOMP
95 #include "seccomp-util.h"
96 #endif
97
/* Final state of a container run.
 * NOTE(review): the code that produces and consumes these values lies
 * outside this part of the file; going by the names, they distinguish a
 * container that simply exited from one that asked to be rebooted. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
102
/* Journal linking mode, selected with --link-journal=MODE (or -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* --link-journal=no */
        LINK_AUTO  = 1,   /* --link-journal=auto (the default) */
        LINK_HOST  = 2,   /* --link-journal=host */
        LINK_GUEST = 3,   /* --link-journal=guest, also selected by -j */
} LinkJournal;
109
/* Volatile mode, selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,   /* default, or --volatile=<false boolean> */
        VOLATILE_YES   = 1,   /* bare --volatile, or --volatile=<true boolean> */
        VOLATILE_STATE = 2,   /* --volatile=state */
} Volatile;
115
116 static char *arg_directory = NULL;
117 static char *arg_user = NULL;
118 static sd_id128_t arg_uuid = {};
119 static char *arg_machine = NULL;
120 static const char *arg_selinux_context = NULL;
121 static const char *arg_selinux_apifs_context = NULL;
122 static const char *arg_slice = NULL;
123 static bool arg_private_network = false;
124 static bool arg_read_only = false;
125 static bool arg_boot = false;
126 static LinkJournal arg_link_journal = LINK_AUTO;
127 static uint64_t arg_retain =
128         (1ULL << CAP_CHOWN) |
129         (1ULL << CAP_DAC_OVERRIDE) |
130         (1ULL << CAP_DAC_READ_SEARCH) |
131         (1ULL << CAP_FOWNER) |
132         (1ULL << CAP_FSETID) |
133         (1ULL << CAP_IPC_OWNER) |
134         (1ULL << CAP_KILL) |
135         (1ULL << CAP_LEASE) |
136         (1ULL << CAP_LINUX_IMMUTABLE) |
137         (1ULL << CAP_NET_BIND_SERVICE) |
138         (1ULL << CAP_NET_BROADCAST) |
139         (1ULL << CAP_NET_RAW) |
140         (1ULL << CAP_SETGID) |
141         (1ULL << CAP_SETFCAP) |
142         (1ULL << CAP_SETPCAP) |
143         (1ULL << CAP_SETUID) |
144         (1ULL << CAP_SYS_ADMIN) |
145         (1ULL << CAP_SYS_CHROOT) |
146         (1ULL << CAP_SYS_NICE) |
147         (1ULL << CAP_SYS_PTRACE) |
148         (1ULL << CAP_SYS_TTY_CONFIG) |
149         (1ULL << CAP_SYS_RESOURCE) |
150         (1ULL << CAP_SYS_BOOT) |
151         (1ULL << CAP_AUDIT_WRITE) |
152         (1ULL << CAP_AUDIT_CONTROL) |
153         (1ULL << CAP_MKNOD);
154 static char **arg_bind = NULL;
155 static char **arg_bind_ro = NULL;
156 static char **arg_tmpfs = NULL;
157 static char **arg_setenv = NULL;
158 static bool arg_quiet = false;
159 static bool arg_share_system = false;
160 static bool arg_register = true;
161 static bool arg_keep_unit = false;
162 static char **arg_network_interfaces = NULL;
163 static char **arg_network_macvlan = NULL;
164 static bool arg_network_veth = false;
165 static const char *arg_network_bridge = NULL;
166 static unsigned long arg_personality = 0xffffffffLU;
167 static const char *arg_image = NULL;
168 static Volatile arg_volatile = VOLATILE_NO;
169
170 static void help(void) {
171         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
172                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
173                "  -h --help                 Show this help\n"
174                "     --version              Print version string\n"
175                "  -q --quiet                Do not show status information\n"
176                "  -D --directory=PATH       Root directory for the container\n"
177                "  -i --image=PATH           File system device or image for the container\n"
178                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
179                "  -u --user=USER            Run the command under specified user or uid\n"
180                "  -M --machine=NAME         Set the machine name for the container\n"
181                "     --uuid=UUID            Set a specific machine UUID for the container\n"
182                "  -S --slice=SLICE          Place the container in the specified slice\n"
183                "     --private-network      Disable network in container\n"
184                "     --network-interface=INTERFACE\n"
185                "                            Assign an existing network interface to the\n"
186                "                            container\n"
187                "     --network-macvlan=INTERFACE\n"
188                "                            Create a macvlan network interface based on an\n"
189                "                            existing network interface to the container\n"
190                "     --network-veth         Add a virtual ethernet connection between host\n"
191                "                            and container\n"
192                "     --network-bridge=INTERFACE\n"
193                "                            Add a virtual ethernet connection between host\n"
194                "                            and container and add it to an existing bridge on\n"
195                "                            the host\n"
196                "  -Z --selinux-context=SECLABEL\n"
197                "                            Set the SELinux security context to be used by\n"
198                "                            processes in the container\n"
199                "  -L --selinux-apifs-context=SECLABEL\n"
200                "                            Set the SELinux security context to be used by\n"
201                "                            API/tmpfs file systems in the container\n"
202                "     --capability=CAP       In addition to the default, retain specified\n"
203                "                            capability\n"
204                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
205                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
206                "  -j                        Equivalent to --link-journal=host\n"
207                "     --read-only            Mount the root directory read-only\n"
208                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
209                "                            the container\n"
210                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
211                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
212                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
213                "     --share-system         Share system namespaces with host\n"
214                "     --register=BOOLEAN     Register container as machine\n"
215                "     --keep-unit            Do not register a scope for the machine, reuse\n"
216                "                            the service unit nspawn is running in\n"
217                "     --volatile[=MODE]      Run the system in volatile mode\n",
218                program_invocation_short_name);
219 }
220
221 static int parse_argv(int argc, char *argv[]) {
222
223         enum {
224                 ARG_VERSION = 0x100,
225                 ARG_PRIVATE_NETWORK,
226                 ARG_UUID,
227                 ARG_READ_ONLY,
228                 ARG_CAPABILITY,
229                 ARG_DROP_CAPABILITY,
230                 ARG_LINK_JOURNAL,
231                 ARG_BIND,
232                 ARG_BIND_RO,
233                 ARG_TMPFS,
234                 ARG_SETENV,
235                 ARG_SHARE_SYSTEM,
236                 ARG_REGISTER,
237                 ARG_KEEP_UNIT,
238                 ARG_NETWORK_INTERFACE,
239                 ARG_NETWORK_MACVLAN,
240                 ARG_NETWORK_VETH,
241                 ARG_NETWORK_BRIDGE,
242                 ARG_PERSONALITY,
243                 ARG_VOLATILE,
244         };
245
246         static const struct option options[] = {
247                 { "help",                  no_argument,       NULL, 'h'                   },
248                 { "version",               no_argument,       NULL, ARG_VERSION           },
249                 { "directory",             required_argument, NULL, 'D'                   },
250                 { "user",                  required_argument, NULL, 'u'                   },
251                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
252                 { "boot",                  no_argument,       NULL, 'b'                   },
253                 { "uuid",                  required_argument, NULL, ARG_UUID              },
254                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
255                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
256                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
257                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
258                 { "bind",                  required_argument, NULL, ARG_BIND              },
259                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
260                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
261                 { "machine",               required_argument, NULL, 'M'                   },
262                 { "slice",                 required_argument, NULL, 'S'                   },
263                 { "setenv",                required_argument, NULL, ARG_SETENV            },
264                 { "selinux-context",       required_argument, NULL, 'Z'                   },
265                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
266                 { "quiet",                 no_argument,       NULL, 'q'                   },
267                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
268                 { "register",              required_argument, NULL, ARG_REGISTER          },
269                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
270                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
271                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
272                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
273                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
274                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
275                 { "image",                 required_argument, NULL, 'i'                   },
276                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
277                 {}
278         };
279
280         int c, r;
281         uint64_t plus = 0, minus = 0;
282
283         assert(argc >= 0);
284         assert(argv);
285
286         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
287
288                 switch (c) {
289
290                 case 'h':
291                         help();
292                         return 0;
293
294                 case ARG_VERSION:
295                         puts(PACKAGE_STRING);
296                         puts(SYSTEMD_FEATURES);
297                         return 0;
298
299                 case 'D':
300                         free(arg_directory);
301                         arg_directory = canonicalize_file_name(optarg);
302                         if (!arg_directory) {
303                                 log_error("Invalid root directory: %m");
304                                 return -ENOMEM;
305                         }
306
307                         break;
308
309                 case 'i':
310                         arg_image = optarg;
311                         break;
312
313                 case 'u':
314                         free(arg_user);
315                         arg_user = strdup(optarg);
316                         if (!arg_user)
317                                 return log_oom();
318
319                         break;
320
321                 case ARG_NETWORK_BRIDGE:
322                         arg_network_bridge = optarg;
323
324                         /* fall through */
325
326                 case ARG_NETWORK_VETH:
327                         arg_network_veth = true;
328                         arg_private_network = true;
329                         break;
330
331                 case ARG_NETWORK_INTERFACE:
332                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
333                                 return log_oom();
334
335                         arg_private_network = true;
336                         break;
337
338                 case ARG_NETWORK_MACVLAN:
339                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
340                                 return log_oom();
341
342                         /* fall through */
343
344                 case ARG_PRIVATE_NETWORK:
345                         arg_private_network = true;
346                         break;
347
348                 case 'b':
349                         arg_boot = true;
350                         break;
351
352                 case ARG_UUID:
353                         r = sd_id128_from_string(optarg, &arg_uuid);
354                         if (r < 0) {
355                                 log_error("Invalid UUID: %s", optarg);
356                                 return r;
357                         }
358                         break;
359
360                 case 'S':
361                         arg_slice = optarg;
362                         break;
363
364                 case 'M':
365                         if (isempty(optarg)) {
366                                 free(arg_machine);
367                                 arg_machine = NULL;
368                         } else {
369
370                                 if (!hostname_is_valid(optarg)) {
371                                         log_error("Invalid machine name: %s", optarg);
372                                         return -EINVAL;
373                                 }
374
375                                 free(arg_machine);
376                                 arg_machine = strdup(optarg);
377                                 if (!arg_machine)
378                                         return log_oom();
379
380                                 break;
381                         }
382
383                 case 'Z':
384                         arg_selinux_context = optarg;
385                         break;
386
387                 case 'L':
388                         arg_selinux_apifs_context = optarg;
389                         break;
390
391                 case ARG_READ_ONLY:
392                         arg_read_only = true;
393                         break;
394
395                 case ARG_CAPABILITY:
396                 case ARG_DROP_CAPABILITY: {
397                         const char *state, *word;
398                         size_t length;
399
400                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
401                                 _cleanup_free_ char *t;
402                                 cap_value_t cap;
403
404                                 t = strndup(word, length);
405                                 if (!t)
406                                         return log_oom();
407
408                                 if (streq(t, "all")) {
409                                         if (c == ARG_CAPABILITY)
410                                                 plus = (uint64_t) -1;
411                                         else
412                                                 minus = (uint64_t) -1;
413                                 } else {
414                                         if (cap_from_name(t, &cap) < 0) {
415                                                 log_error("Failed to parse capability %s.", t);
416                                                 return -EINVAL;
417                                         }
418
419                                         if (c == ARG_CAPABILITY)
420                                                 plus |= 1ULL << (uint64_t) cap;
421                                         else
422                                                 minus |= 1ULL << (uint64_t) cap;
423                                 }
424                         }
425
426                         break;
427                 }
428
429                 case 'j':
430                         arg_link_journal = LINK_GUEST;
431                         break;
432
433                 case ARG_LINK_JOURNAL:
434                         if (streq(optarg, "auto"))
435                                 arg_link_journal = LINK_AUTO;
436                         else if (streq(optarg, "no"))
437                                 arg_link_journal = LINK_NO;
438                         else if (streq(optarg, "guest"))
439                                 arg_link_journal = LINK_GUEST;
440                         else if (streq(optarg, "host"))
441                                 arg_link_journal = LINK_HOST;
442                         else {
443                                 log_error("Failed to parse link journal mode %s", optarg);
444                                 return -EINVAL;
445                         }
446
447                         break;
448
449                 case ARG_BIND:
450                 case ARG_BIND_RO: {
451                         _cleanup_free_ char *a = NULL, *b = NULL;
452                         char *e;
453                         char ***x;
454
455                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
456
457                         e = strchr(optarg, ':');
458                         if (e) {
459                                 a = strndup(optarg, e - optarg);
460                                 b = strdup(e + 1);
461                         } else {
462                                 a = strdup(optarg);
463                                 b = strdup(optarg);
464                         }
465
466                         if (!a || !b)
467                                 return log_oom();
468
469                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
470                                 log_error("Invalid bind mount specification: %s", optarg);
471                                 return -EINVAL;
472                         }
473
474                         r = strv_extend(x, a);
475                         if (r < 0)
476                                 return log_oom();
477
478                         r = strv_extend(x, b);
479                         if (r < 0)
480                                 return log_oom();
481
482                         break;
483                 }
484
485                 case ARG_TMPFS: {
486                         _cleanup_free_ char *a = NULL, *b = NULL;
487                         char *e;
488
489                         e = strchr(optarg, ':');
490                         if (e) {
491                                 a = strndup(optarg, e - optarg);
492                                 b = strdup(e + 1);
493                         } else {
494                                 a = strdup(optarg);
495                                 b = strdup("mode=0755");
496                         }
497
498                         if (!a || !b)
499                                 return log_oom();
500
501                         if (!path_is_absolute(a)) {
502                                 log_error("Invalid tmpfs specification: %s", optarg);
503                                 return -EINVAL;
504                         }
505
506                         r = strv_push(&arg_tmpfs, a);
507                         if (r < 0)
508                                 return log_oom();
509
510                         a = NULL;
511
512                         r = strv_push(&arg_tmpfs, b);
513                         if (r < 0)
514                                 return log_oom();
515
516                         b = NULL;
517
518                         break;
519                 }
520
521                 case ARG_SETENV: {
522                         char **n;
523
524                         if (!env_assignment_is_valid(optarg)) {
525                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
526                                 return -EINVAL;
527                         }
528
529                         n = strv_env_set(arg_setenv, optarg);
530                         if (!n)
531                                 return log_oom();
532
533                         strv_free(arg_setenv);
534                         arg_setenv = n;
535                         break;
536                 }
537
538                 case 'q':
539                         arg_quiet = true;
540                         break;
541
542                 case ARG_SHARE_SYSTEM:
543                         arg_share_system = true;
544                         break;
545
546                 case ARG_REGISTER:
547                         r = parse_boolean(optarg);
548                         if (r < 0) {
549                                 log_error("Failed to parse --register= argument: %s", optarg);
550                                 return r;
551                         }
552
553                         arg_register = r;
554                         break;
555
556                 case ARG_KEEP_UNIT:
557                         arg_keep_unit = true;
558                         break;
559
560                 case ARG_PERSONALITY:
561
562                         arg_personality = personality_from_string(optarg);
563                         if (arg_personality == 0xffffffffLU) {
564                                 log_error("Unknown or unsupported personality '%s'.", optarg);
565                                 return -EINVAL;
566                         }
567
568                         break;
569
570                 case ARG_VOLATILE:
571
572                         if (!optarg)
573                                 arg_volatile = VOLATILE_YES;
574                         else {
575                                 r = parse_boolean(optarg);
576                                 if (r < 0) {
577                                         if (streq(optarg, "state"))
578                                                 arg_volatile = VOLATILE_STATE;
579                                         else {
580                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
581                                                 return r;
582                                         }
583                                 } else
584                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
585                         }
586
587                         break;
588
589                 case '?':
590                         return -EINVAL;
591
592                 default:
593                         assert_not_reached("Unhandled option");
594                 }
595
596         if (arg_share_system)
597                 arg_register = false;
598
599         if (arg_boot && arg_share_system) {
600                 log_error("--boot and --share-system may not be combined.");
601                 return -EINVAL;
602         }
603
604         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
605                 log_error("--keep-unit may not be used when invoked from a user session.");
606                 return -EINVAL;
607         }
608
609         if (arg_directory && arg_image) {
610                 log_error("--directory= and --image= may not be combined.");
611                 return -EINVAL;
612         }
613
614         if (arg_volatile != VOLATILE_NO && arg_read_only) {
615                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
616                 return -EINVAL;
617         }
618
619         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
620
621         return 1;
622 }
623
624 static int mount_all(const char *dest) {
625
626         typedef struct MountPoint {
627                 const char *what;
628                 const char *where;
629                 const char *type;
630                 const char *options;
631                 unsigned long flags;
632                 bool fatal;
633         } MountPoint;
634
635         static const MountPoint mount_table[] = {
636                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
637                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
638                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
639                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
640                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
641                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
642                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
643                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
644 #ifdef HAVE_SELINUX
645                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
646                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
647 #endif
648         };
649
650         unsigned k;
651         int r = 0;
652
653         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
654                 _cleanup_free_ char *where = NULL;
655 #ifdef HAVE_SELINUX
656                 _cleanup_free_ char *options = NULL;
657 #endif
658                 const char *o;
659                 int t;
660
661                 where = strjoin(dest, "/", mount_table[k].where, NULL);
662                 if (!where)
663                         return log_oom();
664
665                 t = path_is_mount_point(where, true);
666                 if (t < 0) {
667                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
668
669                         if (r == 0)
670                                 r = t;
671
672                         continue;
673                 }
674
675                 /* Skip this entry if it is not a remount. */
676                 if (mount_table[k].what && t > 0)
677                         continue;
678
679                 t = mkdir_p(where, 0755);
680                 if (t < 0) {
681                         if (mount_table[k].fatal) {
682                                log_error("Failed to create directory %s: %s", where, strerror(-t));
683
684                                 if (r == 0)
685                                         r = t;
686                         } else
687                                log_warning("Failed to create directory %s: %s", where, strerror(-t));
688
689                         continue;
690                 }
691
692 #ifdef HAVE_SELINUX
693                 if (arg_selinux_apifs_context &&
694                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
695                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
696                         if (!options)
697                                 return log_oom();
698
699                         o = options;
700                 } else
701 #endif
702                         o = mount_table[k].options;
703
704
705                 if (mount(mount_table[k].what,
706                           where,
707                           mount_table[k].type,
708                           mount_table[k].flags,
709                           o) < 0) {
710
711                         if (mount_table[k].fatal) {
712                                 log_error("mount(%s) failed: %m", where);
713
714                                 if (r == 0)
715                                         r = -errno;
716                         } else
717                                 log_warning("mount(%s) failed: %m", where);
718                 }
719         }
720
721         return r;
722 }
723
724 static int mount_binds(const char *dest, char **l, bool ro) {
725         char **x, **y;
726
727         STRV_FOREACH_PAIR(x, y, l) {
728                 _cleanup_free_ char *where = NULL;
729                 struct stat source_st, dest_st;
730                 int r;
731
732                 if (stat(*x, &source_st) < 0) {
733                         log_error("Failed to stat %s: %m", *x);
734                         return -errno;
735                 }
736
737                 where = strappend(dest, *y);
738                 if (!where)
739                         return log_oom();
740
741                 r = stat(where, &dest_st);
742                 if (r == 0) {
743                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
744                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
745                                 return -EINVAL;
746                         }
747                 } else if (errno == ENOENT) {
748                         r = mkdir_parents_label(where, 0755);
749                         if (r < 0) {
750                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
751                                 return r;
752                         }
753                 } else {
754                         log_error("Failed to bind mount %s: %m", *x);
755                         return -errno;
756                 }
757
758                 /* Create the mount point, but be conservative -- refuse to create block
759                  * and char devices. */
760                 if (S_ISDIR(source_st.st_mode)) {
761                         r = mkdir_label(where, 0755);
762                         if (r < 0 && errno != EEXIST) {
763                                 log_error("Failed to create mount point %s: %s", where, strerror(-r));
764
765                                 return r;
766                         }
767                 } else if (S_ISFIFO(source_st.st_mode)) {
768                         r = mkfifo(where, 0644);
769                         if (r < 0 && errno != EEXIST) {
770                                 log_error("Failed to create mount point %s: %m", where);
771
772                                 return -errno;
773                         }
774                 } else if (S_ISSOCK(source_st.st_mode)) {
775                         r = mknod(where, 0644 | S_IFSOCK, 0);
776                         if (r < 0 && errno != EEXIST) {
777                                 log_error("Failed to create mount point %s: %m", where);
778
779                                 return -errno;
780                         }
781                 } else if (S_ISREG(source_st.st_mode)) {
782                         r = touch(where);
783                         if (r < 0) {
784                                 log_error("Failed to create mount point %s: %s", where, strerror(-r));
785
786                                 return r;
787                         }
788                 } else {
789                         log_error("Refusing to create mountpoint for file: %s", *x);
790                         return -ENOTSUP;
791                 }
792
793                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
794                         log_error("mount(%s) failed: %m", where);
795                         return -errno;
796                 }
797
798                 if (ro) {
799                         r = bind_remount_recursive(where, true);
800                         if (r < 0) {
801                                 log_error("Read-Only bind mount failed: %s", strerror(-r));
802                                 return r;
803                         }
804                 }
805         }
806
807         return 0;
808 }
809
810 static int mount_tmpfs(const char *dest) {
811         char **i, **o;
812
813         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
814                 _cleanup_free_ char *where = NULL;
815                 int r;
816
817                 where = strappend(dest, *i);
818                 if (!where)
819                         return log_oom();
820
821                 r = mkdir_label(where, 0755);
822                 if (r < 0) {
823                         log_error("creating mount point for tmpfs %s failed: %s", where, strerror(-r));
824
825                         return r;
826                 }
827
828                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
829                         log_error("tmpfs mount to %s failed: %m", where);
830                         return -errno;
831                 }
832         }
833
834         return 0;
835 }
836
837 static int setup_timezone(const char *dest) {
838         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
839         char *z, *y;
840         int r;
841
842         assert(dest);
843
844         /* Fix the timezone, if possible */
845         r = readlink_malloc("/etc/localtime", &p);
846         if (r < 0) {
847                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
848                 return 0;
849         }
850
851         z = path_startswith(p, "../usr/share/zoneinfo/");
852         if (!z)
853                 z = path_startswith(p, "/usr/share/zoneinfo/");
854         if (!z) {
855                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
856                 return 0;
857         }
858
859         where = strappend(dest, "/etc/localtime");
860         if (!where)
861                 return log_oom();
862
863         r = readlink_malloc(where, &q);
864         if (r >= 0) {
865                 y = path_startswith(q, "../usr/share/zoneinfo/");
866                 if (!y)
867                         y = path_startswith(q, "/usr/share/zoneinfo/");
868
869                 /* Already pointing to the right place? Then do nothing .. */
870                 if (y && streq(y, z))
871                         return 0;
872         }
873
874         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
875         if (!check)
876                 return log_oom();
877
878         if (access(check, F_OK) < 0) {
879                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
880                 return 0;
881         }
882
883         what = strappend("../usr/share/zoneinfo/", z);
884         if (!what)
885                 return log_oom();
886
887         r = mkdir_parents(where, 0755);
888         if (r < 0) {
889                 log_error("Failed to create directory for timezone info %s in container: %s", where, strerror(-r));
890
891                 return 0;
892         }
893
894         r = unlink(where);
895         if (r < 0 && errno != ENOENT) {
896                 log_error("Failed to remove existing timezone info %s in container: %m", where);
897
898                 return 0;
899         }
900
901         if (symlink(what, where) < 0) {
902                 log_error("Failed to correct timezone of container: %m");
903                 return 0;
904         }
905
906         return 0;
907 }
908
909 static int setup_resolv_conf(const char *dest) {
910         _cleanup_free_ char *where = NULL;
911         int r;
912
913         assert(dest);
914
915         if (arg_private_network)
916                 return 0;
917
918         /* Fix resolv.conf, if possible */
919         where = strappend(dest, "/etc/resolv.conf");
920         if (!where)
921                 return log_oom();
922
923         /* We don't really care for the results of this really. If it
924          * fails, it fails, but meh... */
925         r = mkdir_parents(where, 0755);
926         if (r < 0) {
927                 log_warning("Failed to create parent directory for resolv.conf %s: %s", where, strerror(-r));
928
929                 return 0;
930         }
931
932         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
933         if (r < 0) {
934                 log_warning("Failed to copy /etc/resolv.conf to %s: %s", where, strerror(-r));
935
936                 return 0;
937         }
938
939         return 0;
940 }
941
942 static int setup_volatile_state(const char *directory) {
943         const char *p;
944         int r;
945
946         assert(directory);
947
948         if (arg_volatile != VOLATILE_STATE)
949                 return 0;
950
951         /* --volatile=state means we simply overmount /var
952            with a tmpfs, and the rest read-only. */
953
954         r = bind_remount_recursive(directory, true);
955         if (r < 0) {
956                 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
957                 return r;
958         }
959
960         p = strappenda(directory, "/var");
961         r = mkdir(p, 0755);
962         if (r < 0 && errno != EEXIST) {
963                 log_error("Failed to create %s: %m", directory);
964                 return -errno;
965         }
966
967         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
968                 log_error("Failed to mount tmpfs to /var: %m");
969                 return -errno;
970         }
971
972         return 0;
973 }
974
975 static int setup_volatile(const char *directory) {
976         bool tmpfs_mounted = false, bind_mounted = false;
977         char template[] = "/tmp/nspawn-volatile-XXXXXX";
978         const char *f, *t;
979         int r;
980
981         assert(directory);
982
983         if (arg_volatile != VOLATILE_YES)
984                 return 0;
985
986         /* --volatile=yes means we mount a tmpfs to the root dir, and
987            the original /usr to use inside it, and that read-only. */
988
989         if (!mkdtemp(template)) {
990                 log_error("Failed to create temporary directory: %m");
991                 return -errno;
992         }
993
994         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
995                 log_error("Failed to mount tmpfs for root directory: %m");
996                 r = -errno;
997                 goto fail;
998         }
999
1000         tmpfs_mounted = true;
1001
1002         f = strappenda(directory, "/usr");
1003         t = strappenda(template, "/usr");
1004
1005         r = mkdir(t, 0755);
1006         if (r < 0 && errno != EEXIST) {
1007                 log_error("Failed to create %s: %m", t);
1008                 r = -errno;
1009                 goto fail;
1010         }
1011
1012         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1013                 log_error("Failed to create /usr bind mount: %m");
1014                 r = -errno;
1015                 goto fail;
1016         }
1017
1018         bind_mounted = true;
1019
1020         r = bind_remount_recursive(t, true);
1021         if (r < 0) {
1022                 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
1023                 goto fail;
1024         }
1025
1026         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1027                 log_error("Failed to move root mount: %m");
1028                 r = -errno;
1029                 goto fail;
1030         }
1031
1032         rmdir(template);
1033
1034         return 0;
1035
1036 fail:
1037         if (bind_mounted)
1038                 umount(t);
1039         if (tmpfs_mounted)
1040                 umount(template);
1041         rmdir(template);
1042         return r;
1043 }
1044
1045 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1046
1047         snprintf(s, 37,
1048                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1049                  SD_ID128_FORMAT_VAL(id));
1050
1051         return s;
1052 }
1053
1054 static int setup_boot_id(const char *dest) {
1055         _cleanup_free_ char *from = NULL, *to = NULL;
1056         sd_id128_t rnd = {};
1057         char as_uuid[37];
1058         int r;
1059
1060         assert(dest);
1061
1062         if (arg_share_system)
1063                 return 0;
1064
1065         /* Generate a new randomized boot ID, so that each boot-up of
1066          * the container gets a new one */
1067
1068         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1069         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1070         if (!from || !to)
1071                 return log_oom();
1072
1073         r = sd_id128_randomize(&rnd);
1074         if (r < 0) {
1075                 log_error("Failed to generate random boot id: %s", strerror(-r));
1076                 return r;
1077         }
1078
1079         id128_format_as_uuid(rnd, as_uuid);
1080
1081         r = write_string_file(from, as_uuid);
1082         if (r < 0) {
1083                 log_error("Failed to write boot id: %s", strerror(-r));
1084                 return r;
1085         }
1086
1087         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1088                 log_error("Failed to bind mount boot id: %m");
1089                 r = -errno;
1090         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1091                 log_warning("Failed to make boot id read-only: %m");
1092
1093         unlink(from);
1094         return r;
1095 }
1096
1097 static int copy_devnodes(const char *dest) {
1098
1099         static const char devnodes[] =
1100                 "null\0"
1101                 "zero\0"
1102                 "full\0"
1103                 "random\0"
1104                 "urandom\0"
1105                 "tty\0"
1106                 "net/tun\0";
1107
1108         const char *d;
1109         int r = 0;
1110         _cleanup_umask_ mode_t u;
1111
1112         assert(dest);
1113
1114         u = umask(0000);
1115
1116         NULSTR_FOREACH(d, devnodes) {
1117                 _cleanup_free_ char *from = NULL, *to = NULL;
1118                 struct stat st;
1119
1120                 from = strappend("/dev/", d);
1121                 to = strjoin(dest, "/dev/", d, NULL);
1122                 if (!from || !to)
1123                         return log_oom();
1124
1125                 if (stat(from, &st) < 0) {
1126
1127                         if (errno != ENOENT) {
1128                                 log_error("Failed to stat %s: %m", from);
1129                                 return -errno;
1130                         }
1131
1132                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1133
1134                         log_error("%s is not a char or block device, cannot copy", from);
1135                         return -EIO;
1136
1137                 } else {
1138                         r = mkdir_parents(to, 0775);
1139                         if (r < 0) {
1140                                 log_error("Failed to create parent directory of %s: %s", to, strerror(-r));
1141                                 return -r;
1142                         }
1143
1144                         if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1145                                 log_error("mknod(%s) failed: %m", dest);
1146                                 return  -errno;
1147                         }
1148                 }
1149         }
1150
1151         return r;
1152 }
1153
1154 static int setup_ptmx(const char *dest) {
1155         _cleanup_free_ char *p = NULL;
1156
1157         p = strappend(dest, "/dev/ptmx");
1158         if (!p)
1159                 return log_oom();
1160
1161         if (symlink("pts/ptmx", p) < 0) {
1162                 log_error("Failed to create /dev/ptmx symlink: %m");
1163                 return -errno;
1164         }
1165
1166         return 0;
1167 }
1168
1169 static int setup_dev_console(const char *dest, const char *console) {
1170         _cleanup_umask_ mode_t u;
1171         const char *to;
1172         struct stat st;
1173         int r;
1174
1175         assert(dest);
1176         assert(console);
1177
1178         u = umask(0000);
1179
1180         if (stat("/dev/null", &st) < 0) {
1181                 log_error("Failed to stat /dev/null: %m");
1182                 return -errno;
1183         }
1184
1185         r = chmod_and_chown(console, 0600, 0, 0);
1186         if (r < 0) {
1187                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
1188                 return r;
1189         }
1190
1191         /* We need to bind mount the right tty to /dev/console since
1192          * ptys can only exist on pts file systems. To have something
1193          * to bind mount things on we create a device node first, and
1194          * use /dev/null for that since we the cgroups device policy
1195          * allows us to create that freely, while we cannot create
1196          * /dev/console. (Note that the major minor doesn't actually
1197          * matter here, since we mount it over anyway). */
1198
1199         to = strappenda(dest, "/dev/console");
1200         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1201                 log_error("mknod() for /dev/console failed: %m");
1202                 return -errno;
1203         }
1204
1205         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1206                 log_error("Bind mount for /dev/console failed: %m");
1207                 return -errno;
1208         }
1209
1210         return 0;
1211 }
1212
1213 static int setup_kmsg(const char *dest, int kmsg_socket) {
1214         _cleanup_free_ char *from = NULL, *to = NULL;
1215         int r, fd, k;
1216         _cleanup_umask_ mode_t u;
1217         union {
1218                 struct cmsghdr cmsghdr;
1219                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1220         } control = {};
1221         struct msghdr mh = {
1222                 .msg_control = &control,
1223                 .msg_controllen = sizeof(control),
1224         };
1225         struct cmsghdr *cmsg;
1226
1227         assert(dest);
1228         assert(kmsg_socket >= 0);
1229
1230         u = umask(0000);
1231
1232         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1233          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1234          * on the reading side behave very similar to /proc/kmsg,
1235          * their writing side behaves differently from /dev/kmsg in
1236          * that writing blocks when nothing is reading. In order to
1237          * avoid any problems with containers deadlocking due to this
1238          * we simply make /dev/kmsg unavailable to the container. */
1239         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1240             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1241                 return log_oom();
1242
1243         if (mkfifo(from, 0600) < 0) {
1244                 log_error("mkfifo() for /dev/kmsg failed: %m");
1245                 return -errno;
1246         }
1247
1248         r = chmod_and_chown(from, 0600, 0, 0);
1249         if (r < 0) {
1250                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1251                 return r;
1252         }
1253
1254         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1255                 log_error("Bind mount for /proc/kmsg failed: %m");
1256                 return -errno;
1257         }
1258
1259         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1260         if (fd < 0) {
1261                 log_error("Failed to open fifo: %m");
1262                 return -errno;
1263         }
1264
1265         cmsg = CMSG_FIRSTHDR(&mh);
1266         cmsg->cmsg_level = SOL_SOCKET;
1267         cmsg->cmsg_type = SCM_RIGHTS;
1268         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1269         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1270
1271         mh.msg_controllen = cmsg->cmsg_len;
1272
1273         /* Store away the fd in the socket, so that it stays open as
1274          * long as we run the child */
1275         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1276         safe_close(fd);
1277
1278         if (k < 0) {
1279                 log_error("Failed to send FIFO fd: %m");
1280                 return -errno;
1281         }
1282
1283         /* And now make the FIFO unavailable as /dev/kmsg... */
1284         unlink(from);
1285         return 0;
1286 }
1287
1288 static int setup_hostname(void) {
1289
1290         if (arg_share_system)
1291                 return 0;
1292
1293         if (sethostname_idempotent(arg_machine) < 0)
1294                 return -errno;
1295
1296         return 0;
1297 }
1298
1299 static int setup_journal(const char *directory) {
1300         sd_id128_t machine_id, this_id;
1301         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1302         char *id;
1303         int r;
1304
1305         p = strappend(directory, "/etc/machine-id");
1306         if (!p)
1307                 return log_oom();
1308
1309         r = read_one_line_file(p, &b);
1310         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1311                 return 0;
1312         else if (r < 0) {
1313                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1314                 return r;
1315         }
1316
1317         id = strstrip(b);
1318         if (isempty(id) && arg_link_journal == LINK_AUTO)
1319                 return 0;
1320
1321         /* Verify validity */
1322         r = sd_id128_from_string(id, &machine_id);
1323         if (r < 0) {
1324                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1325                 return r;
1326         }
1327
1328         r = sd_id128_get_machine(&this_id);
1329         if (r < 0) {
1330                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1331                 return r;
1332         }
1333
1334         if (sd_id128_equal(machine_id, this_id)) {
1335                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1336                          "Host and machine ids are equal (%s): refusing to link journals", id);
1337                 if (arg_link_journal == LINK_AUTO)
1338                         return 0;
1339                 return
1340                         -EEXIST;
1341         }
1342
1343         if (arg_link_journal == LINK_NO)
1344                 return 0;
1345
1346         free(p);
1347         p = strappend("/var/log/journal/", id);
1348         q = strjoin(directory, "/var/log/journal/", id, NULL);
1349         if (!p || !q)
1350                 return log_oom();
1351
1352         if (path_is_mount_point(p, false) > 0) {
1353                 if (arg_link_journal != LINK_AUTO) {
1354                         log_error("%s: already a mount point, refusing to use for journal", p);
1355                         return -EEXIST;
1356                 }
1357
1358                 return 0;
1359         }
1360
1361         if (path_is_mount_point(q, false) > 0) {
1362                 if (arg_link_journal != LINK_AUTO) {
1363                         log_error("%s: already a mount point, refusing to use for journal", q);
1364                         return -EEXIST;
1365                 }
1366
1367                 return 0;
1368         }
1369
1370         r = readlink_and_make_absolute(p, &d);
1371         if (r >= 0) {
1372                 if ((arg_link_journal == LINK_GUEST ||
1373                      arg_link_journal == LINK_AUTO) &&
1374                     path_equal(d, q)) {
1375
1376                         r = mkdir_p(q, 0755);
1377                         if (r < 0)
1378                                 log_warning("Failed to create directory %s: %m", q);
1379                         return 0;
1380                 }
1381
1382                 if (unlink(p) < 0) {
1383                         log_error("Failed to remove symlink %s: %m", p);
1384                         return -errno;
1385                 }
1386         } else if (r == -EINVAL) {
1387
1388                 if (arg_link_journal == LINK_GUEST &&
1389                     rmdir(p) < 0) {
1390
1391                         if (errno == ENOTDIR) {
1392                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1393                                 return r;
1394                         } else {
1395                                 log_error("Failed to remove %s: %m", p);
1396                                 return -errno;
1397                         }
1398                 }
1399         } else if (r != -ENOENT) {
1400                 log_error("readlink(%s) failed: %m", p);
1401                 return r;
1402         }
1403
1404         if (arg_link_journal == LINK_GUEST) {
1405
1406                 if (symlink(q, p) < 0) {
1407                         log_error("Failed to symlink %s to %s: %m", q, p);
1408                         return -errno;
1409                 }
1410
1411                 r = mkdir_p(q, 0755);
1412                 if (r < 0)
1413                         log_warning("Failed to create directory %s: %m", q);
1414                 return 0;
1415         }
1416
1417         if (arg_link_journal == LINK_HOST) {
1418                 r = mkdir_p(p, 0755);
1419                 if (r < 0) {
1420                         log_error("Failed to create %s: %m", p);
1421                         return r;
1422                 }
1423
1424         } else if (access(p, F_OK) < 0)
1425                 return 0;
1426
1427         if (dir_is_empty(q) == 0)
1428                 log_warning("%s is not empty, proceeding anyway.", q);
1429
1430         r = mkdir_p(q, 0755);
1431         if (r < 0) {
1432                 log_error("Failed to create %s: %m", q);
1433                 return r;
1434         }
1435
1436         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1437                 log_error("Failed to bind mount journal from host into guest: %m");
1438                 return -errno;
1439         }
1440
1441         return 0;
1442 }
1443
1444 static int drop_capabilities(void) {
1445         return capability_bounding_set_drop(~arg_retain, false);
1446 }
1447
1448 static int register_machine(pid_t pid, int local_ifindex) {
1449         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1450         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1451         int r;
1452
1453         if (!arg_register)
1454                 return 0;
1455
1456         r = sd_bus_default_system(&bus);
1457         if (r < 0) {
1458                 log_error("Failed to open system bus: %s", strerror(-r));
1459                 return r;
1460         }
1461
1462         if (arg_keep_unit) {
1463                 r = sd_bus_call_method(
1464                                 bus,
1465                                 "org.freedesktop.machine1",
1466                                 "/org/freedesktop/machine1",
1467                                 "org.freedesktop.machine1.Manager",
1468                                 "RegisterMachineWithNetwork",
1469                                 &error,
1470                                 NULL,
1471                                 "sayssusai",
1472                                 arg_machine,
1473                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1474                                 "nspawn",
1475                                 "container",
1476                                 (uint32_t) pid,
1477                                 strempty(arg_directory),
1478                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1479         } else {
1480                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1481
1482                 r = sd_bus_message_new_method_call(
1483                                 bus,
1484                                 &m,
1485                                 "org.freedesktop.machine1",
1486                                 "/org/freedesktop/machine1",
1487                                 "org.freedesktop.machine1.Manager",
1488                                 "CreateMachineWithNetwork");
1489                 if (r < 0) {
1490                         log_error("Failed to create message: %s", strerror(-r));
1491                         return r;
1492                 }
1493
1494                 r = sd_bus_message_append(
1495                                 m,
1496                                 "sayssusai",
1497                                 arg_machine,
1498                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1499                                 "nspawn",
1500                                 "container",
1501                                 (uint32_t) pid,
1502                                 strempty(arg_directory),
1503                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1504                 if (r < 0) {
1505                         log_error("Failed to append message arguments: %s", strerror(-r));
1506                         return r;
1507                 }
1508
1509                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1510                 if (r < 0) {
1511                         log_error("Failed to open container: %s", strerror(-r));
1512                         return r;
1513                 }
1514
1515                 if (!isempty(arg_slice)) {
1516                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1517                         if (r < 0) {
1518                                 log_error("Failed to append slice: %s", strerror(-r));
1519                                 return r;
1520                         }
1521                 }
1522
1523                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1524                 if (r < 0) {
1525                         log_error("Failed to add device policy: %s", strerror(-r));
1526                         return r;
1527                 }
1528
1529                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1530                                           /* Allow the container to
1531                                            * access and create the API
1532                                            * device nodes, so that
1533                                            * PrivateDevices= in the
1534                                            * container can work
1535                                            * fine */
1536                                           "/dev/null", "rwm",
1537                                           "/dev/zero", "rwm",
1538                                           "/dev/full", "rwm",
1539                                           "/dev/random", "rwm",
1540                                           "/dev/urandom", "rwm",
1541                                           "/dev/tty", "rwm",
1542                                           "/dev/net/tun", "rwm",
1543                                           /* Allow the container
1544                                            * access to ptys. However,
1545                                            * do not permit the
1546                                            * container to ever create
1547                                            * these device nodes. */
1548                                           "/dev/pts/ptmx", "rw",
1549                                           "char-pts", "rw");
1550                 if (r < 0) {
1551                         log_error("Failed to add device whitelist: %s", strerror(-r));
1552                         return r;
1553                 }
1554
1555                 r = sd_bus_message_close_container(m);
1556                 if (r < 0) {
1557                         log_error("Failed to close container: %s", strerror(-r));
1558                         return r;
1559                 }
1560
1561                 r = sd_bus_call(bus, m, 0, &error, NULL);
1562         }
1563
1564         if (r < 0) {
1565                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1566                 return r;
1567         }
1568
1569         return 0;
1570 }
1571
1572 static int terminate_machine(pid_t pid) {
1573         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1574         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1575         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1576         const char *path;
1577         int r;
1578
1579         if (!arg_register)
1580                 return 0;
1581
1582         r = sd_bus_default_system(&bus);
1583         if (r < 0) {
1584                 log_error("Failed to open system bus: %s", strerror(-r));
1585                 return r;
1586         }
1587
1588         r = sd_bus_call_method(
1589                         bus,
1590                         "org.freedesktop.machine1",
1591                         "/org/freedesktop/machine1",
1592                         "org.freedesktop.machine1.Manager",
1593                         "GetMachineByPID",
1594                         &error,
1595                         &reply,
1596                         "u",
1597                         (uint32_t) pid);
1598         if (r < 0) {
1599                 /* Note that the machine might already have been
1600                  * cleaned up automatically, hence don't consider it a
1601                  * failure if we cannot get the machine object. */
1602                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1603                 return 0;
1604         }
1605
1606         r = sd_bus_message_read(reply, "o", &path);
1607         if (r < 0)
1608                 return bus_log_parse_error(r);
1609
1610         r = sd_bus_call_method(
1611                         bus,
1612                         "org.freedesktop.machine1",
1613                         path,
1614                         "org.freedesktop.machine1.Machine",
1615                         "Terminate",
1616                         &error,
1617                         NULL,
1618                         NULL);
1619         if (r < 0) {
1620                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1621                 return 0;
1622         }
1623
1624         return 0;
1625 }
1626
1627 static int reset_audit_loginuid(void) {
1628         _cleanup_free_ char *p = NULL;
1629         int r;
1630
1631         if (arg_share_system)
1632                 return 0;
1633
1634         r = read_one_line_file("/proc/self/loginuid", &p);
1635         if (r == -ENOENT)
1636                 return 0;
1637         if (r < 0) {
1638                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1639                 return r;
1640         }
1641
1642         /* Already reset? */
1643         if (streq(p, "4294967295"))
1644                 return 0;
1645
1646         r = write_string_file("/proc/self/loginuid", "4294967295");
1647         if (r < 0) {
1648                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1649                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1650                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1651                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1652                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1653
1654                 sleep(5);
1655         }
1656
1657         return 0;
1658 }
1659
1660 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1661 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1662
1663 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
1664         int r;
1665
1666         uint8_t result[8];
1667         size_t l, sz;
1668         uint8_t *v;
1669
1670         l = strlen(arg_machine);
1671         sz = sizeof(sd_id128_t) + l;
1672         v = alloca(sz);
1673
1674         /* fetch some persistent data unique to the host */
1675         r = sd_id128_get_machine((sd_id128_t*) v);
1676         if (r < 0)
1677                 return r;
1678
1679         /* combine with some data unique (on this host) to this
1680          * container instance */
1681         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1682
1683         /* Let's hash the host machine ID plus the container name. We
1684          * use a fixed, but originally randomly created hash key here. */
1685         siphash24(result, v, sz, hash_key.bytes);
1686
1687         assert_cc(ETH_ALEN <= sizeof(result));
1688         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1689
1690         /* see eth_random_addr in the kernel */
1691         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1692         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1693
1694         return 0;
1695 }
1696
1697 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1698         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1699         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1700         struct ether_addr mac_host, mac_container;
1701         int r, i;
1702
1703         if (!arg_private_network)
1704                 return 0;
1705
1706         if (!arg_network_veth)
1707                 return 0;
1708
1709         /* Use two different interface name prefixes depending whether
1710          * we are in bridge mode or not. */
1711         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1712                  arg_network_bridge ? "vb" : "ve", arg_machine);
1713
1714         r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
1715         if (r < 0) {
1716                 log_error("Failed to generate predictable MAC address for container side");
1717                 return r;
1718         }
1719
1720         r = generate_mac(&mac_host, HOST_HASH_KEY);
1721         if (r < 0) {
1722                 log_error("Failed to generate predictable MAC address for host side");
1723                 return r;
1724         }
1725
1726         r = sd_rtnl_open(&rtnl, 0);
1727         if (r < 0) {
1728                 log_error("Failed to connect to netlink: %s", strerror(-r));
1729                 return r;
1730         }
1731
1732         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1733         if (r < 0) {
1734                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1735                 return r;
1736         }
1737
1738         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1739         if (r < 0) {
1740                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1741                 return r;
1742         }
1743
1744         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1745         if (r < 0) {
1746                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1747                 return r;
1748         }
1749
1750         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1751         if (r < 0) {
1752                 log_error("Failed to open netlink container: %s", strerror(-r));
1753                 return r;
1754         }
1755
1756         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1757         if (r < 0) {
1758                 log_error("Failed to open netlink container: %s", strerror(-r));
1759                 return r;
1760         }
1761
1762         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1763         if (r < 0) {
1764                 log_error("Failed to open netlink container: %s", strerror(-r));
1765                 return r;
1766         }
1767
1768         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1769         if (r < 0) {
1770                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1771                 return r;
1772         }
1773
1774         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1775         if (r < 0) {
1776                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1777                 return r;
1778         }
1779
1780         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1781         if (r < 0) {
1782                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1783                 return r;
1784         }
1785
1786         r = sd_rtnl_message_close_container(m);
1787         if (r < 0) {
1788                 log_error("Failed to close netlink container: %s", strerror(-r));
1789                 return r;
1790         }
1791
1792         r = sd_rtnl_message_close_container(m);
1793         if (r < 0) {
1794                 log_error("Failed to close netlink container: %s", strerror(-r));
1795                 return r;
1796         }
1797
1798         r = sd_rtnl_message_close_container(m);
1799         if (r < 0) {
1800                 log_error("Failed to close netlink container: %s", strerror(-r));
1801                 return r;
1802         }
1803
1804         r = sd_rtnl_call(rtnl, m, 0, NULL);
1805         if (r < 0) {
1806                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1807                 return r;
1808         }
1809
1810         i = (int) if_nametoindex(iface_name);
1811         if (i <= 0) {
1812                 log_error("Failed to resolve interface %s: %m", iface_name);
1813                 return -errno;
1814         }
1815
1816         *ifi = i;
1817
1818         return 0;
1819 }
1820
1821 static int setup_bridge(const char veth_name[], int *ifi) {
1822         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1823         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1824         int r, bridge;
1825
1826         if (!arg_private_network)
1827                 return 0;
1828
1829         if (!arg_network_veth)
1830                 return 0;
1831
1832         if (!arg_network_bridge)
1833                 return 0;
1834
1835         bridge = (int) if_nametoindex(arg_network_bridge);
1836         if (bridge <= 0) {
1837                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1838                 return -errno;
1839         }
1840
1841         *ifi = bridge;
1842
1843         r = sd_rtnl_open(&rtnl, 0);
1844         if (r < 0) {
1845                 log_error("Failed to connect to netlink: %s", strerror(-r));
1846                 return r;
1847         }
1848
1849         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1850         if (r < 0) {
1851                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1852                 return r;
1853         }
1854
1855         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1856         if (r < 0) {
1857                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1858                 return r;
1859         }
1860
1861         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1862         if (r < 0) {
1863                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1864                 return r;
1865         }
1866
1867         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1868         if (r < 0) {
1869                 log_error("Failed to add netlink master field: %s", strerror(-r));
1870                 return r;
1871         }
1872
1873         r = sd_rtnl_call(rtnl, m, 0, NULL);
1874         if (r < 0) {
1875                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1876                 return r;
1877         }
1878
1879         return 0;
1880 }
1881
1882 static int parse_interface(struct udev *udev, const char *name) {
1883         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1884         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1885         int ifi;
1886
1887         ifi = (int) if_nametoindex(name);
1888         if (ifi <= 0) {
1889                 log_error("Failed to resolve interface %s: %m", name);
1890                 return -errno;
1891         }
1892
1893         sprintf(ifi_str, "n%i", ifi);
1894         d = udev_device_new_from_device_id(udev, ifi_str);
1895         if (!d) {
1896                 log_error("Failed to get udev device for interface %s: %m", name);
1897                 return -errno;
1898         }
1899
1900         if (udev_device_get_is_initialized(d) <= 0) {
1901                 log_error("Network interface %s is not initialized yet.", name);
1902                 return -EBUSY;
1903         }
1904
1905         return ifi;
1906 }
1907
1908 static int move_network_interfaces(pid_t pid) {
1909         _cleanup_udev_unref_ struct udev *udev = NULL;
1910         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1911         char **i;
1912         int r;
1913
1914         if (!arg_private_network)
1915                 return 0;
1916
1917         if (strv_isempty(arg_network_interfaces))
1918                 return 0;
1919
1920         r = sd_rtnl_open(&rtnl, 0);
1921         if (r < 0) {
1922                 log_error("Failed to connect to netlink: %s", strerror(-r));
1923                 return r;
1924         }
1925
1926         udev = udev_new();
1927         if (!udev) {
1928                 log_error("Failed to connect to udev.");
1929                 return -ENOMEM;
1930         }
1931
1932         STRV_FOREACH(i, arg_network_interfaces) {
1933                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1934                 int ifi;
1935
1936                 ifi = parse_interface(udev, *i);
1937                 if (ifi < 0)
1938                         return ifi;
1939
1940                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1941                 if (r < 0) {
1942                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1943                         return r;
1944                 }
1945
1946                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1947                 if (r < 0) {
1948                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1949                         return r;
1950                 }
1951
1952                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1953                 if (r < 0) {
1954                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1955                         return r;
1956                 }
1957         }
1958
1959         return 0;
1960 }
1961
1962 static int setup_macvlan(pid_t pid) {
1963         _cleanup_udev_unref_ struct udev *udev = NULL;
1964         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1965         char **i;
1966         int r;
1967
1968         if (!arg_private_network)
1969                 return 0;
1970
1971         if (strv_isempty(arg_network_macvlan))
1972                 return 0;
1973
1974         r = sd_rtnl_open(&rtnl, 0);
1975         if (r < 0) {
1976                 log_error("Failed to connect to netlink: %s", strerror(-r));
1977                 return r;
1978         }
1979
1980         udev = udev_new();
1981         if (!udev) {
1982                 log_error("Failed to connect to udev.");
1983                 return -ENOMEM;
1984         }
1985
1986         STRV_FOREACH(i, arg_network_macvlan) {
1987                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1988                 _cleanup_free_ char *n = NULL;
1989                 int ifi;
1990
1991                 ifi = parse_interface(udev, *i);
1992                 if (ifi < 0)
1993                         return ifi;
1994
1995                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1996                 if (r < 0) {
1997                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1998                         return r;
1999                 }
2000
2001                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2002                 if (r < 0) {
2003                         log_error("Failed to add netlink interface index: %s", strerror(-r));
2004                         return r;
2005                 }
2006
2007                 n = strappend("mv-", *i);
2008                 if (!n)
2009                         return log_oom();
2010
2011                 strshorten(n, IFNAMSIZ-1);
2012
2013                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2014                 if (r < 0) {
2015                         log_error("Failed to add netlink interface name: %s", strerror(-r));
2016                         return r;
2017                 }
2018
2019                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2020                 if (r < 0) {
2021                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
2022                         return r;
2023                 }
2024
2025                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2026                 if (r < 0) {
2027                         log_error("Failed to open netlink container: %s", strerror(-r));
2028                         return r;
2029                 }
2030
2031                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2032                 if (r < 0) {
2033                         log_error("Failed to open netlink container: %s", strerror(-r));
2034                         return r;
2035                 }
2036
2037                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2038                 if (r < 0) {
2039                         log_error("Failed to append macvlan mode: %s", strerror(-r));
2040                         return r;
2041                 }
2042
2043                 r = sd_rtnl_message_close_container(m);
2044                 if (r < 0) {
2045                         log_error("Failed to close netlink container: %s", strerror(-r));
2046                         return r;
2047                 }
2048
2049                 r = sd_rtnl_message_close_container(m);
2050                 if (r < 0) {
2051                         log_error("Failed to close netlink container: %s", strerror(-r));
2052                         return r;
2053                 }
2054
2055                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2056                 if (r < 0) {
2057                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
2058                         return r;
2059                 }
2060         }
2061
2062         return 0;
2063 }
2064
/* Installs the seccomp filter for the container payload.
 *
 * The filter allows everything by default, makes the blacklisted system
 * calls below fail with EPERM and makes socket(AF_NETLINK, *,
 * NETLINK_AUDIT) fail with EAFNOSUPPORT. The NO_NEW_PRIVS attribute of the
 * filter is explicitly set to 0 before loading.
 *
 * Returns 0 on success (always, when built without HAVE_SECCOMP), or a
 * negative errno-style value after logging. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int ret;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        ret = seccomp_add_secondary_archs(ctx);
        if (ret < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-ret));
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blocked_syscalls); k++) {
                ret = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[k], 0);

                /* -EFAULT indicates an unknown syscall; such entries are
                 * simply skipped. */
                if (ret < 0 && ret != -EFAULT) {
                        log_error("Failed to block syscall: %s", strerror(-ret));
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        ret = seccomp_rule_add(ctx,
                               SCMP_ACT_ERRNO(EAFNOSUPPORT),
                               SCMP_SYS(socket),
                               2,
                               SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                               SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (ret < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-ret));
                goto finish;
        }

        ret = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (ret < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-ret));
                goto finish;
        }

        ret = seccomp_load(ctx);
        if (ret < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-ret));

finish:
        /* The filter context is released on success and failure alike. */
        seccomp_release(ctx);
        return ret;
#else
        return 0;
#endif

}
2144
2145 static int setup_image(char **device_path, int *loop_nr) {
2146         struct loop_info64 info = {
2147                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2148         };
2149         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2150         _cleanup_free_ char* loopdev = NULL;
2151         struct stat st;
2152         int r, nr;
2153
2154         assert(device_path);
2155         assert(loop_nr);
2156
2157         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2158         if (fd < 0) {
2159                 log_error("Failed to open %s: %m", arg_image);
2160                 return -errno;
2161         }
2162
2163         if (fstat(fd, &st) < 0) {
2164                 log_error("Failed to stat %s: %m", arg_image);
2165                 return -errno;
2166         }
2167
2168         if (S_ISBLK(st.st_mode)) {
2169                 char *p;
2170
2171                 p = strdup(arg_image);
2172                 if (!p)
2173                         return log_oom();
2174
2175                 *device_path = p;
2176
2177                 *loop_nr = -1;
2178
2179                 r = fd;
2180                 fd = -1;
2181
2182                 return r;
2183         }
2184
2185         if (!S_ISREG(st.st_mode)) {
2186                 log_error("%s is not a regular file or block device: %m", arg_image);
2187                 return -EINVAL;
2188         }
2189
2190         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2191         if (control < 0) {
2192                 log_error("Failed to open /dev/loop-control: %m");
2193                 return -errno;
2194         }
2195
2196         nr = ioctl(control, LOOP_CTL_GET_FREE);
2197         if (nr < 0) {
2198                 log_error("Failed to allocate loop device: %m");
2199                 return -errno;
2200         }
2201
2202         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2203                 return log_oom();
2204
2205         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2206         if (loop < 0) {
2207                 log_error("Failed to open loop device %s: %m", loopdev);
2208                 return -errno;
2209         }
2210
2211         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2212                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2213                 return -errno;
2214         }
2215
2216         if (arg_read_only)
2217                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2218
2219         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2220                 log_error("Failed to set loopback settings on %s: %m", loopdev);
2221                 return -errno;
2222         }
2223
2224         *device_path = loopdev;
2225         loopdev = NULL;
2226
2227         *loop_nr = nr;
2228
2229         r = loop;
2230         loop = -1;
2231
2232         return r;
2233 }
2234
/* Looks for the root, /home and /srv partitions of the GPT disk image
 * whose block device is open as fd.
 *
 * The partition table is probed with libblkid; the partition block devices
 * are found by enumerating the udev children of the image device.
 * Partitions flagged GPT_FLAG_NO_AUTO are ignored, and if several
 * partitions have the same type the one with the lowest partition number
 * is used.
 *
 * On success returns 0 and stores newly allocated device node paths, which
 * the caller must free:
 *   - *root_device / *root_device_rw: the native root partition if there
 *     is one (*secondary = false), otherwise the secondary root partition
 *     (*secondary = true);
 *   - *home_device / *home_device_rw and *srv_device / *srv_device_rw:
 *     only written if such a partition exists.
 * The *_rw flags are false for partitions flagged GPT_FLAG_READ_ONLY.
 *
 * On failure a negative errno-style value is returned after logging;
 * -EINVAL is used when the image has no usable GPT or no root partition.
 * When built without HAVE_BLKID, -ENOTSUP is returned. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerator = NULL;
        _cleanup_udev_device_unref_ struct udev_device *disk = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *head, *cursor;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *table_type = NULL;
        blkid_partlist partitions;
        struct stat st;
        int ret;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        /* Part 1: read the partition table with libblkid. */

        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        errno = 0;
        ret = blkid_probe_set_device(probe, fd, 0, 0);
        if (ret != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        ret = blkid_do_safeprobe(probe);
        if (ret == -2 || ret == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (ret != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(probe, "PTTYPE", &table_type, NULL);
        if (!streq_ptr(table_type, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partitions = blkid_probe_get_partitions(probe);
        if (!partitions) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        /* Part 2: enumerate the udev devices below the image device. */

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        disk = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!disk)
                return log_oom();

        enumerator = udev_enumerate_new(udev);
        if (!enumerator)
                return log_oom();

        ret = udev_enumerate_add_match_parent(enumerator, disk);
        if (ret < 0)
                return log_oom();

        ret = udev_enumerate_scan_devices(enumerator);
        if (ret < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-ret));
                return ret;
        }

        /* Part 3: match every partition device against the table and
         * remember the best candidate per partition type. */

        head = udev_enumerate_get_list_entry(enumerator);
        udev_list_entry_foreach(cursor, head) {
                _cleanup_udev_device_unref_ struct udev_device *part = NULL;
                const char *type_string, *devnode;
                unsigned long long part_flags;
                sd_id128_t type_id;
                blkid_partition entry;
                dev_t part_devnum;
                int partno;

                /* The candidate slot that matches the partition type. */
                char **slot;
                int *slot_nr;
                bool *slot_rw;

                errno = 0;
                part = udev_device_new_from_syspath(udev, udev_list_entry_get_name(cursor));
                if (!part) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                part_devnum = udev_device_get_devnum(part);
                if (major(part_devnum) == 0)
                        continue;

                /* Skip the image device itself. */
                if (st.st_rdev == part_devnum)
                        continue;

                devnode = udev_device_get_devnode(part);
                if (!devnode)
                        continue;

                entry = blkid_partlist_devno_to_partition(partitions, part_devnum);
                if (!entry)
                        continue;

                part_flags = blkid_partition_get_flags(entry);
                if (part_flags & GPT_FLAG_NO_AUTO)
                        continue;

                partno = blkid_partition_get_partno(entry);
                if (partno < 0)
                        continue;

                type_string = blkid_partition_get_type_string(entry);
                if (!type_string)
                        continue;

                if (sd_id128_from_string(type_string, &type_id) < 0)
                        continue;

                if (sd_id128_equal(type_id, GPT_HOME)) {
                        slot = &home;
                        slot_nr = &home_nr;
                        slot_rw = &home_rw;
                } else if (sd_id128_equal(type_id, GPT_SRV)) {
                        slot = &srv;
                        slot_nr = &srv_nr;
                        slot_rw = &srv_rw;
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
                        slot = &root;
                        slot_nr = &root_nr;
                        slot_rw = &root_rw;
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
                        slot = &secondary_root;
                        slot_nr = &secondary_root_nr;
                        slot_rw = &secondary_root_rw;
                }
#endif
                else
                        continue;

                /* Keep an existing candidate unless this partition has a
                 * lower partition number. */
                if (*slot && partno >= *slot_nr)
                        continue;

                *slot_nr = partno;
                *slot_rw = !(part_flags & GPT_FLAG_READ_ONLY);

                free(*slot);
                *slot = strdup(devnode);
                if (!*slot)
                        return log_oom();
        }

        /* Part 4: hand the results over to the caller. */

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else {
                /* No native root, hence the secondary one must exist. */
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2484
/* Detects the file system type of the block device 'what' via libblkid and
 * mounts it on 'where', or on 'where' with 'directory' appended when
 * 'directory' is non-NULL. The mount is read-only when 'rw' is false or when
 * the global arg_read_only is set. LUKS volumes are refused.
 *
 * Returns 0 on success and a negative errno-style value on failure. Without
 * HAVE_BLKID the function only logs an error and returns -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        const char *type, *target;
        unsigned long flags = MS_NODEV;
        int q;

        assert(what);
        assert(where);

        /* The global read-only switch overrides the per-device setting */
        if (arg_read_only || !rw)
                flags |= MS_RDONLY;

        target = where;
        if (directory)
                target = strappenda(where, directory);

        /* errno is cleared first so that a failure which leaves it
         * untouched can be told apart and reported as OOM */
        errno = 0;
        probe = blkid_new_probe_from_filename(what);
        if (!probe) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        /* Only the superblock TYPE value is needed */
        blkid_probe_enable_superblocks(probe, 1);
        blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

        errno = 0;
        q = blkid_do_safeprobe(probe);
        if (q == 1 || q == -1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        }
        if (q != 0) {
                /* Any other non-zero result: make sure we report something */
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(probe, "TYPE", &type, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(type, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, target, type, flags, NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2550
/* Mounts the partitions found in a disk image below 'where':
 *   root_device -> where
 *   home_device -> where/home
 *   srv_device  -> where/srv
 * Each device argument may be NULL, in which case it is skipped. The
 * matching *_rw flag is passed on to mount_device() as its 'rw' argument.
 *
 * Returns 0 on success, or the negative error returned by the first
 * mount_device() call that failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount below the 'where' argument rather than the global
         * arg_directory, so the parameter is actually honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2586
2587 static void loop_remove(int nr, int *image_fd) {
2588         _cleanup_close_ int control = -1;
2589         int r;
2590
2591         if (nr < 0)
2592                 return;
2593
2594         if (image_fd && *image_fd >= 0) {
2595                 r = ioctl(*image_fd, LOOP_CLR_FD);
2596                 if (r < 0)
2597                         log_warning("Failed to close loop image: %m");
2598                 *image_fd = safe_close(*image_fd);
2599         }
2600
2601         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2602         if (control < 0) {
2603                 log_warning("Failed to open /dev/loop-control: %m");
2604                 return;
2605         }
2606
2607         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2608         if (r < 0)
2609                 log_warning("Failed to remove loop %d: %m", nr);
2610 }
2611
2612 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2613         int pipe_fds[2];
2614         pid_t pid;
2615
2616         assert(database);
2617         assert(key);
2618         assert(rpid);
2619
2620         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2621                 log_error("Failed to allocate pipe: %m");
2622                 return -errno;
2623         }
2624
2625         pid = fork();
2626         if (pid < 0) {
2627                 log_error("Failed to fork getent child: %m");
2628                 return -errno;
2629         } else if (pid == 0) {
2630                 int nullfd;
2631                 char *empty_env = NULL;
2632
2633                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2634                         _exit(EXIT_FAILURE);
2635
2636                 if (pipe_fds[0] > 2)
2637                         safe_close(pipe_fds[0]);
2638                 if (pipe_fds[1] > 2)
2639                         safe_close(pipe_fds[1]);
2640
2641                 nullfd = open("/dev/null", O_RDWR);
2642                 if (nullfd < 0)
2643                         _exit(EXIT_FAILURE);
2644
2645                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2646                         _exit(EXIT_FAILURE);
2647
2648                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2649                         _exit(EXIT_FAILURE);
2650
2651                 if (nullfd > 2)
2652                         safe_close(nullfd);
2653
2654                 reset_all_signal_handlers();
2655                 close_all_fds(NULL, 0);
2656
2657                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2658                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2659                 _exit(EXIT_FAILURE);
2660         }
2661
2662         pipe_fds[1] = safe_close(pipe_fds[1]);
2663
2664         *rpid = pid;
2665
2666         return pipe_fds[0];
2667 }
2668
2669 static int change_uid_gid(char **_home) {
2670         char line[LINE_MAX], *x, *u, *g, *h;
2671         const char *word, *state;
2672         _cleanup_free_ uid_t *uids = NULL;
2673         _cleanup_free_ char *home = NULL;
2674         _cleanup_fclose_ FILE *f = NULL;
2675         _cleanup_close_ int fd = -1;
2676         unsigned n_uids = 0;
2677         size_t sz = 0, l;
2678         uid_t uid;
2679         gid_t gid;
2680         pid_t pid;
2681         int r;
2682
2683         assert(_home);
2684
2685         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2686                 /* Reset everything fully to 0, just in case */
2687
2688                 if (setgroups(0, NULL) < 0) {
2689                         log_error("setgroups() failed: %m");
2690                         return -errno;
2691                 }
2692
2693                 if (setresgid(0, 0, 0) < 0) {
2694                         log_error("setregid() failed: %m");
2695                         return -errno;
2696                 }
2697
2698                 if (setresuid(0, 0, 0) < 0) {
2699                         log_error("setreuid() failed: %m");
2700                         return -errno;
2701                 }
2702
2703                 *_home = NULL;
2704                 return 0;
2705         }
2706
2707         /* First, get user credentials */
2708         fd = spawn_getent("passwd", arg_user, &pid);
2709         if (fd < 0)
2710                 return fd;
2711
2712         f = fdopen(fd, "r");
2713         if (!f)
2714                 return log_oom();
2715         fd = -1;
2716
2717         if (!fgets(line, sizeof(line), f)) {
2718
2719                 if (!ferror(f)) {
2720                         log_error("Failed to resolve user %s.", arg_user);
2721                         return -ESRCH;
2722                 }
2723
2724                 log_error("Failed to read from getent: %m");
2725                 return -errno;
2726         }
2727
2728         truncate_nl(line);
2729
2730         wait_for_terminate_and_warn("getent passwd", pid);
2731
2732         x = strchr(line, ':');
2733         if (!x) {
2734                 log_error("/etc/passwd entry has invalid user field.");
2735                 return -EIO;
2736         }
2737
2738         u = strchr(x+1, ':');
2739         if (!u) {
2740                 log_error("/etc/passwd entry has invalid password field.");
2741                 return -EIO;
2742         }
2743
2744         u++;
2745         g = strchr(u, ':');
2746         if (!g) {
2747                 log_error("/etc/passwd entry has invalid UID field.");
2748                 return -EIO;
2749         }
2750
2751         *g = 0;
2752         g++;
2753         x = strchr(g, ':');
2754         if (!x) {
2755                 log_error("/etc/passwd entry has invalid GID field.");
2756                 return -EIO;
2757         }
2758
2759         *x = 0;
2760         h = strchr(x+1, ':');
2761         if (!h) {
2762                 log_error("/etc/passwd entry has invalid GECOS field.");
2763                 return -EIO;
2764         }
2765
2766         h++;
2767         x = strchr(h, ':');
2768         if (!x) {
2769                 log_error("/etc/passwd entry has invalid home directory field.");
2770                 return -EIO;
2771         }
2772
2773         *x = 0;
2774
2775         r = parse_uid(u, &uid);
2776         if (r < 0) {
2777                 log_error("Failed to parse UID of user.");
2778                 return -EIO;
2779         }
2780
2781         r = parse_gid(g, &gid);
2782         if (r < 0) {
2783                 log_error("Failed to parse GID of user.");
2784                 return -EIO;
2785         }
2786
2787         home = strdup(h);
2788         if (!home)
2789                 return log_oom();
2790
2791         /* Second, get group memberships */
2792         fd = spawn_getent("initgroups", arg_user, &pid);
2793         if (fd < 0)
2794                 return fd;
2795
2796         fclose(f);
2797         f = fdopen(fd, "r");
2798         if (!f)
2799                 return log_oom();
2800         fd = -1;
2801
2802         if (!fgets(line, sizeof(line), f)) {
2803                 if (!ferror(f)) {
2804                         log_error("Failed to resolve user %s.", arg_user);
2805                         return -ESRCH;
2806                 }
2807
2808                 log_error("Failed to read from getent: %m");
2809                 return -errno;
2810         }
2811
2812         truncate_nl(line);
2813
2814         wait_for_terminate_and_warn("getent initgroups", pid);
2815
2816         /* Skip over the username and subsequent separator whitespace */
2817         x = line;
2818         x += strcspn(x, WHITESPACE);
2819         x += strspn(x, WHITESPACE);
2820
2821         FOREACH_WORD(word, l, x, state) {
2822                 char c[l+1];
2823
2824                 memcpy(c, word, l);
2825                 c[l] = 0;
2826
2827                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2828                         return log_oom();
2829
2830                 r = parse_uid(c, &uids[n_uids++]);
2831                 if (r < 0) {
2832                         log_error("Failed to parse group data from getent.");
2833                         return -EIO;
2834                 }
2835         }
2836
2837         r = mkdir_parents(home, 0775);
2838         if (r < 0) {
2839                 log_error("Failed to make home root directory: %s", strerror(-r));
2840                 return r;
2841         }
2842
2843         r = mkdir_safe(home, 0755, uid, gid);
2844         if (r < 0 && r != -EEXIST) {
2845                 log_error("Failed to make home directory: %s", strerror(-r));
2846                 return r;
2847         }
2848
2849         fchown(STDIN_FILENO, uid, gid);
2850         fchown(STDOUT_FILENO, uid, gid);
2851         fchown(STDERR_FILENO, uid, gid);
2852
2853         if (setgroups(n_uids, uids) < 0) {
2854                 log_error("Failed to set auxiliary groups: %m");
2855                 return -errno;
2856         }
2857
2858         if (setresgid(gid, gid, gid) < 0) {
2859                 log_error("setregid() failed: %m");
2860                 return -errno;
2861         }
2862
2863         if (setresuid(uid, uid, uid) < 0) {
2864                 log_error("setreuid() failed: %m");
2865                 return -errno;
2866         }
2867
2868         if (_home) {
2869                 *_home = home;
2870                 home = NULL;
2871         }
2872
2873         return 0;
2874 }
2875
2876 /*
2877  * Return values:
2878  * < 0 : wait_for_terminate() failed to get the state of the
2879  *       container, the container was terminated by a signal, or
2880  *       failed for an unknown reason.  No change is made to the
2881  *       container argument.
2882  * > 0 : The program executed in the container terminated with an
2883  *       error.  The exit code of the program executed in the
2884  *       container is returned.  The container argument has been set
2885  *       to CONTAINER_TERMINATED.
2886  *   0 : The container is being rebooted, has been shut down or exited
2887  *       successfully.  The container argument has been set to either
2888  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2889  *
2890  * That is, success is indicated by a return value of zero, and an
2891  * error is indicated by a non-zero value.
2892  */
2893 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2894         siginfo_t status;
2895         int r;
2896
2897         r = wait_for_terminate(pid, &status);
2898         if (r < 0) {
2899                 log_warning("Failed to wait for container: %s", strerror(-r));
2900                 return r;
2901         }
2902
2903         switch (status.si_code) {
2904
2905         case CLD_EXITED:
2906                 if (status.si_status == 0) {
2907                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2908
2909                 } else
2910                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2911
2912                 *container = CONTAINER_TERMINATED;
2913                 return status.si_status;
2914
2915         case CLD_KILLED:
2916                 if (status.si_status == SIGINT) {
2917
2918                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2919                         *container = CONTAINER_TERMINATED;
2920                         return 0;
2921
2922                 } else if (status.si_status == SIGHUP) {
2923
2924                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2925                         *container = CONTAINER_REBOOTED;
2926                         return 0;
2927                 }
2928
2929                 /* CLD_KILLED fallthrough */
2930
2931         case CLD_DUMPED:
2932                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2933                 return -EIO;
2934
2935         default:
2936                 log_error("Container %s failed due to unknown reason.", arg_machine);
2937                 return -EIO;
2938         }
2939
2940         return r;
2941 }
2942
/* Signal handler that deliberately does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts blocking calls in the parent. */
static void nop_handler(int sig) {
}
2944
2945 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2946         pid_t pid;
2947
2948         pid = PTR_TO_UINT32(userdata);
2949         if (pid > 0) {
2950                 if (kill(pid, SIGRTMIN+3) >= 0) {
2951                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2952                         sd_event_source_set_userdata(s, NULL);
2953                         return 0;
2954                 }
2955         }
2956
2957         sd_event_exit(sd_event_source_get_event(s), 0);
2958         return 0;
2959 }
2960
2961 int main(int argc, char *argv[]) {
2962
2963         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2964         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2965         _cleanup_close_ int master = -1, image_fd = -1;
2966         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2967         _cleanup_fdset_free_ FDSet *fds = NULL;
2968         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2969         const char *console = NULL;
2970         char veth_name[IFNAMSIZ];
2971         bool secondary = false;
2972         sigset_t mask, mask_chld;
2973         pid_t pid = 0;
2974
2975         log_parse_environment();
2976         log_open();
2977
2978         k = parse_argv(argc, argv);
2979         if (k < 0)
2980                 goto finish;
2981         else if (k == 0) {
2982                 r = EXIT_SUCCESS;
2983                 goto finish;
2984         }
2985
2986         if (!arg_image) {
2987                 if (arg_directory) {
2988                         char *p;
2989
2990                         p = path_make_absolute_cwd(arg_directory);
2991                         free(arg_directory);
2992                         arg_directory = p;
2993                 } else
2994                         arg_directory = get_current_dir_name();
2995
2996                 if (!arg_directory) {
2997                         log_error("Failed to determine path, please use -D.");
2998                         goto finish;
2999                 }
3000                 path_kill_slashes(arg_directory);
3001         }
3002
3003         if (!arg_machine) {
3004                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
3005                 if (!arg_machine) {
3006                         log_oom();
3007                         goto finish;
3008                 }
3009
3010                 hostname_cleanup(arg_machine, false);
3011                 if (isempty(arg_machine)) {
3012                         log_error("Failed to determine machine name automatically, please use -M.");
3013                         goto finish;
3014                 }
3015         }
3016
3017         if (geteuid() != 0) {
3018                 log_error("Need to be root.");
3019                 goto finish;
3020         }
3021
3022         if (sd_booted() <= 0) {
3023                 log_error("Not running on a systemd system.");
3024                 goto finish;
3025         }
3026
3027         log_close();
3028         n_fd_passed = sd_listen_fds(false);
3029         if (n_fd_passed > 0) {
3030                 k = fdset_new_listen_fds(&fds, false);
3031                 if (k < 0) {
3032                         log_error("Failed to collect file descriptors: %s", strerror(-k));
3033                         goto finish;
3034                 }
3035         }
3036         fdset_close_others(fds);
3037         log_open();
3038
3039         if (arg_directory) {
3040                 if (path_equal(arg_directory, "/")) {
3041                         log_error("Spawning container on root directory not supported.");
3042                         goto finish;
3043                 }
3044
3045                 if (arg_boot) {
3046                         if (path_is_os_tree(arg_directory) <= 0) {
3047                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3048                                 goto finish;
3049                         }
3050                 } else {
3051                         const char *p;
3052
3053                         p = strappenda(arg_directory,
3054                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3055                         if (access(p, F_OK) < 0) {
3056                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3057                                 goto finish;
3058
3059                         }
3060                 }
3061         } else {
3062                 char template[] = "/tmp/nspawn-root-XXXXXX";
3063
3064                 if (!mkdtemp(template)) {
3065                         log_error("Failed to create temporary directory: %m");
3066                         r = -errno;
3067                         goto finish;
3068                 }
3069
3070                 arg_directory = strdup(template);
3071                 if (!arg_directory) {
3072                         r = log_oom();
3073                         goto finish;
3074                 }
3075
3076                 image_fd = setup_image(&device_path, &loop_nr);
3077                 if (image_fd < 0) {
3078                         r = image_fd;
3079                         goto finish;
3080                 }
3081
3082                 r = dissect_image(image_fd,
3083                                   &root_device, &root_device_rw,
3084                                   &home_device, &home_device_rw,
3085                                   &srv_device, &srv_device_rw,
3086                                   &secondary);
3087                 if (r < 0)
3088                         goto finish;
3089         }
3090
3091         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3092         if (master < 0) {
3093                 log_error("Failed to acquire pseudo tty: %m");
3094                 goto finish;
3095         }
3096
3097         console = ptsname(master);
3098         if (!console) {
3099                 log_error("Failed to determine tty name: %m");
3100                 goto finish;
3101         }
3102
3103         if (!arg_quiet)
3104                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3105                          arg_machine, arg_image ? arg_image : arg_directory);
3106
3107         if (unlockpt(master) < 0) {
3108                 log_error("Failed to unlock tty: %m");
3109                 goto finish;
3110         }
3111
3112         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3113                 log_error("Failed to create kmsg socket pair: %m");
3114                 goto finish;
3115         }
3116
3117         sd_notify(false,
3118                   "READY=1\n"
3119                   "STATUS=Container running.");
3120
3121         assert_se(sigemptyset(&mask) == 0);
3122         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3123         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3124
3125         assert_se(sigemptyset(&mask_chld) == 0);
3126         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3127
3128         for (;;) {
3129                 ContainerStatus container_status;
3130                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3131                 struct sigaction sa = {
3132                         .sa_handler = nop_handler,
3133                         .sa_flags = SA_NOCLDSTOP,
3134                 };
3135
3136                 r = barrier_create(&barrier);
3137                 if (r < 0) {
3138                         log_error("Cannot initialize IPC barrier: %s", strerror(-r));
3139                         goto finish;
3140                 }
3141
3142                 /* Child can be killed before execv(), so handle SIGCHLD
3143                  * in order to interrupt parent's blocking calls and
3144                  * give it a chance to call wait() and terminate. */
3145                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3146                 if (r < 0) {
3147                         log_error("Failed to change the signal mask: %m");
3148                         goto finish;
3149                 }
3150
3151                 r = sigaction(SIGCHLD, &sa, NULL);
3152                 if (r < 0) {
3153                         log_error("Failed to install SIGCHLD handler: %m");
3154                         goto finish;
3155                 }
3156
3157                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3158                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3159                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
3160                 if (pid < 0) {
3161                         if (errno == EINVAL)
3162                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3163                         else
3164                                 log_error("clone() failed: %m");
3165
3166                         r = pid;
3167                         goto finish;
3168                 }
3169
3170                 if (pid == 0) {
3171                         /* child */
3172                         _cleanup_free_ char *home = NULL;
3173                         unsigned n_env = 2;
3174                         const char *envp[] = {
3175                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3176                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3177                                 NULL, /* TERM */
3178                                 NULL, /* HOME */
3179                                 NULL, /* USER */
3180                                 NULL, /* LOGNAME */
3181                                 NULL, /* container_uuid */
3182                                 NULL, /* LISTEN_FDS */
3183                                 NULL, /* LISTEN_PID */
3184                                 NULL
3185                         };
3186                         char **env_use;
3187
3188                         barrier_set_role(&barrier, BARRIER_CHILD);
3189
3190                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3191                         if (envp[n_env])
3192                                 n_env ++;
3193
3194                         master = safe_close(master);
3195
3196                         close_nointr(STDIN_FILENO);
3197                         close_nointr(STDOUT_FILENO);
3198                         close_nointr(STDERR_FILENO);
3199
3200                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3201
3202                         reset_all_signal_handlers();
3203                         reset_signal_mask();
3204
3205                         k = open_terminal(console, O_RDWR);
3206                         if (k != STDIN_FILENO) {
3207                                 if (k >= 0) {
3208                                         safe_close(k);
3209                                         k = -EINVAL;
3210                                 }
3211
3212                                 log_error("Failed to open console: %s", strerror(-k));
3213                                 _exit(EXIT_FAILURE);
3214                         }
3215
3216                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3217                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3218                                 log_error("Failed to duplicate console: %m");
3219                                 _exit(EXIT_FAILURE);
3220                         }
3221
3222                         if (setsid() < 0) {
3223                                 log_error("setsid() failed: %m");
3224                                 _exit(EXIT_FAILURE);
3225                         }
3226
3227                         if (reset_audit_loginuid() < 0)
3228                                 _exit(EXIT_FAILURE);
3229
3230                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3231                                 log_error("PR_SET_PDEATHSIG failed: %m");
3232                                 _exit(EXIT_FAILURE);
3233                         }
3234
3235                         /* Mark everything as slave, so that we still
3236                          * receive mounts from the real root, but don't
3237                          * propagate mounts to the real root. */
3238                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3239                                 log_error("MS_SLAVE|MS_REC failed: %m");
3240                                 _exit(EXIT_FAILURE);
3241                         }
3242
3243                         if (mount_devices(arg_directory,
3244                                           root_device, root_device_rw,
3245                                           home_device, home_device_rw,
3246                                           srv_device, srv_device_rw) < 0)
3247                                 _exit(EXIT_FAILURE);
3248
3249                         /* Turn directory into bind mount */
3250                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3251                                 log_error("Failed to make bind mount: %m");
3252                                 _exit(EXIT_FAILURE);
3253                         }
3254
3255                         r = setup_volatile(arg_directory);
3256                         if (r < 0)
3257                                 _exit(EXIT_FAILURE);
3258
3259                         if (setup_volatile_state(arg_directory) < 0)
3260                                 _exit(EXIT_FAILURE);
3261
3262                         r = base_filesystem_create(arg_directory);
3263                         if (r < 0)
3264                                 _exit(EXIT_FAILURE);
3265
3266                         if (arg_read_only) {
3267                                 k = bind_remount_recursive(arg_directory, true);
3268                                 if (k < 0) {
3269                                         log_error("Failed to make tree read-only: %s", strerror(-k));
3270                                         _exit(EXIT_FAILURE);
3271                                 }
3272                         }
3273
3274                         if (mount_all(arg_directory) < 0)
3275                                 _exit(EXIT_FAILURE);
3276
3277                         if (copy_devnodes(arg_directory) < 0)
3278                                 _exit(EXIT_FAILURE);
3279
3280                         if (setup_ptmx(arg_directory) < 0)
3281                                 _exit(EXIT_FAILURE);
3282
3283                         dev_setup(arg_directory);
3284
3285                         if (setup_seccomp() < 0)
3286                                 _exit(EXIT_FAILURE);
3287
3288                         if (setup_dev_console(arg_directory, console) < 0)
3289                                 _exit(EXIT_FAILURE);
3290
3291                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3292                                 _exit(EXIT_FAILURE);
3293
3294                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3295
3296                         if (setup_boot_id(arg_directory) < 0)
3297                                 _exit(EXIT_FAILURE);
3298
3299                         if (setup_timezone(arg_directory) < 0)
3300                                 _exit(EXIT_FAILURE);
3301
3302                         if (setup_resolv_conf(arg_directory) < 0)
3303                                 _exit(EXIT_FAILURE);
3304
3305                         if (setup_journal(arg_directory) < 0)
3306                                 _exit(EXIT_FAILURE);
3307
3308                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3309                                 _exit(EXIT_FAILURE);
3310
3311                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3312                                 _exit(EXIT_FAILURE);
3313
3314                         if (mount_tmpfs(arg_directory) < 0)
3315                                 _exit(EXIT_FAILURE);
3316
3317                         /* Tell the parent that we are ready, and that
3318                          * it can cgroupify us so that we lack access
3319                          * to certain devices and resources. */
3320                         (void)barrier_place(&barrier);
3321
3322                         if (chdir(arg_directory) < 0) {
3323                                 log_error("chdir(%s) failed: %m", arg_directory);
3324                                 _exit(EXIT_FAILURE);
3325                         }
3326
3327                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3328                                 log_error("mount(MS_MOVE) failed: %m");
3329                                 _exit(EXIT_FAILURE);
3330                         }
3331
3332                         if (chroot(".") < 0) {
3333                                 log_error("chroot() failed: %m");
3334                                 _exit(EXIT_FAILURE);
3335                         }
3336
3337                         if (chdir("/") < 0) {
3338                                 log_error("chdir() failed: %m");
3339                                 _exit(EXIT_FAILURE);
3340                         }
3341
3342                         umask(0022);
3343
3344                         if (arg_private_network)
3345                                 loopback_setup();
3346
3347                         if (drop_capabilities() < 0) {
3348                                 log_error("drop_capabilities() failed: %m");
3349                                 _exit(EXIT_FAILURE);
3350                         }
3351
3352                         r = change_uid_gid(&home);
3353                         if (r < 0)
3354                                 _exit(EXIT_FAILURE);
3355
3356                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3357                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3358                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3359                                 log_oom();
3360                                 _exit(EXIT_FAILURE);
3361                         }
3362
3363                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3364                                 char as_uuid[37];
3365
3366                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3367                                         log_oom();
3368                                         _exit(EXIT_FAILURE);
3369                                 }
3370                         }
3371
3372                         if (fdset_size(fds) > 0) {
3373                                 k = fdset_cloexec(fds, false);
3374                                 if (k < 0) {
3375                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3376                                         _exit(EXIT_FAILURE);
3377                                 }
3378
3379                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3380                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3381                                         log_oom();
3382                                         _exit(EXIT_FAILURE);
3383                                 }
3384                         }
3385
3386                         setup_hostname();
3387
3388                         if (arg_personality != 0xffffffffLU) {
3389                                 if (personality(arg_personality) < 0) {
3390                                         log_error("personality() failed: %m");
3391                                         _exit(EXIT_FAILURE);
3392                                 }
3393                         } else if (secondary) {
3394                                 if (personality(PER_LINUX32) < 0) {
3395                                         log_error("personality() failed: %m");
3396                                         _exit(EXIT_FAILURE);
3397                                 }
3398                         }
3399
3400 #ifdef HAVE_SELINUX
3401                         if (arg_selinux_context)
3402                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3403                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3404                                         _exit(EXIT_FAILURE);
3405                                 }
3406 #endif
3407
3408                         if (!strv_isempty(arg_setenv)) {
3409                                 char **n;
3410
3411                                 n = strv_env_merge(2, envp, arg_setenv);
3412                                 if (!n) {
3413                                         log_oom();
3414                                         _exit(EXIT_FAILURE);
3415                                 }
3416
3417                                 env_use = n;
3418                         } else
3419                                 env_use = (char**) envp;
3420
3421                         /* Wait until the parent is ready with the setup, too... */
3422                         if (!barrier_place_and_sync(&barrier))
3423                                 _exit(EXIT_FAILURE);
3424
3425                         if (arg_boot) {
3426                                 char **a;
3427                                 size_t l;
3428
3429                                 /* Automatically search for the init system */
3430
3431                                 l = 1 + argc - optind;
3432                                 a = newa(char*, l + 1);
3433                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3434
3435                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3436                                 execve(a[0], a, env_use);
3437
3438                                 a[0] = (char*) "/lib/systemd/systemd";
3439                                 execve(a[0], a, env_use);
3440
3441                                 a[0] = (char*) "/sbin/init";
3442                                 execve(a[0], a, env_use);
3443                         } else if (argc > optind)
3444                                 execvpe(argv[optind], argv + optind, env_use);
3445                         else {
3446                                 chdir(home ? home : "/root");
3447                                 execle("/bin/bash", "-bash", NULL, env_use);
3448                                 execle("/bin/sh", "-sh", NULL, env_use);
3449                         }
3450
3451                         log_error("execv() failed: %m");
3452                         _exit(EXIT_FAILURE);
3453                 }
3454
3455                 barrier_set_role(&barrier, BARRIER_PARENT);
3456                 fdset_free(fds);
3457                 fds = NULL;
3458
3459                 /* wait for child-setup to be done */
3460                 if (barrier_place_and_sync(&barrier)) {
3461                         _cleanup_event_unref_ sd_event *event = NULL;
3462                         _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3463                         int ifi = 0;
3464
3465                         r = move_network_interfaces(pid);
3466                         if (r < 0)
3467                                 goto finish;
3468
3469                         r = setup_veth(pid, veth_name, &ifi);
3470                         if (r < 0)
3471                                 goto finish;
3472
3473                         r = setup_bridge(veth_name, &ifi);
3474                         if (r < 0)
3475                                 goto finish;
3476
3477                         r = setup_macvlan(pid);
3478                         if (r < 0)
3479                                 goto finish;
3480
3481                         r = register_machine(pid, ifi);
3482                         if (r < 0)
3483                                 goto finish;
3484
3485                         /* Block SIGCHLD here, before notifying child.
3486                          * process_pty() will handle it with the other signals. */
3487                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3488                         if (r < 0)
3489                                 goto finish;
3490
3491                         /* Reset signal to default */
3492                         r = default_signals(SIGCHLD, -1);
3493                         if (r < 0)
3494                                 goto finish;
3495
3496                         /* Notify the child that the parent is ready with all
3497                          * its setup, and that the child can now hand over
3498                          * control to the code to run inside the container. */
3499                         (void)barrier_place(&barrier);
3500
3501                         r = sd_event_new(&event);
3502                         if (r < 0) {
3503                                 log_error("Failed to get default event source: %s", strerror(-r));
3504                                 goto finish;
3505                         }
3506
3507                         if (arg_boot) {
3508                                 /* Try to kill the init system on SIGINT or SIGTERM */
3509                                 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3510                                 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3511                         } else {
3512                                 /* Immediately exit */
3513                                 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3514                                 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3515                         }
3516
3517                         /* simply exit on sigchld */
3518                         sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3519
3520                         r = pty_forward_new(event, master, &forward);
3521                         if (r < 0) {
3522                                 log_error("Failed to create PTY forwarder: %s", strerror(-r));
3523                                 goto finish;
3524                         }
3525
3526                         r = sd_event_loop(event);
3527                         if (r < 0) {
3528                                 log_error("Failed to run event loop: %s", strerror(-r));
3529                                 return r;
3530                         }
3531
3532                         forward = pty_forward_free(forward);
3533
3534                         if (!arg_quiet)
3535                                 putc('\n', stdout);
3536
3537                         /* Kill if it is not dead yet anyway */
3538                         terminate_machine(pid);
3539                 }
3540
3541                 /* Normally redundant, but better safe than sorry */
3542                 kill(pid, SIGKILL);
3543
3544                 r = wait_for_container(pid, &container_status);
3545                 pid = 0;
3546
3547                 if (r < 0) {
3548                         /* We failed to wait for the container, or the
3549                          * container exited abnormally */
3550                         r = EXIT_FAILURE;
3551                         break;
3552                 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3553                         /* The container exited with a non-zero
3554                          * status, or with zero status and no reboot
3555                          * was requested. */
3556                         break;
3557
3558                 /* CONTAINER_REBOOTED, loop again */
3559
3560                 if (arg_keep_unit) {
3561                         /* Special handling if we are running as a
3562                          * service: instead of simply restarting the
3563                          * machine we want to restart the entire
3564                          * service, so let's inform systemd about this
3565                          * with the special exit code 133. The service
3566                          * file uses RestartForceExitStatus=133 so
3567                          * that this results in a full nspawn
3568                          * restart. This is necessary since we might
3569                          * have cgroup parameters set we want to have
3570                          * flushed out. */
3571                         r = 133;
3572                         break;
3573                 }
3574         }
3575
3576 finish:
3577         sd_notify(false,
3578                   "STOPPING=1\n"
3579                   "STATUS=Terminating...");
3580
3581         loop_remove(loop_nr, &image_fd);
3582
3583         if (pid > 0)
3584                 kill(pid, SIGKILL);
3585
3586         free(arg_directory);
3587         free(arg_machine);
3588         free(arg_user);
3589         strv_free(arg_setenv);
3590         strv_free(arg_network_interfaces);
3591         strv_free(arg_network_macvlan);
3592         strv_free(arg_bind);
3593         strv_free(arg_bind_ro);
3594         strv_free(arg_tmpfs);
3595
3596         return r;
3597 }