chiark / gitweb /
treewide: auto-convert the simple cases to log_*_errno()
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92 #include "event-util.h"
93
94 #ifdef HAVE_SECCOMP
95 #include "seccomp-util.h"
96 #endif
97
/* Possible outcomes of a container run. The code that produces and
 * consumes these values lies outside this part of the file. */
98 typedef enum ContainerStatus {
99         CONTAINER_TERMINATED,
100         CONTAINER_REBOOTED
101 } ContainerStatus;
102
/* Journal linking mode, chosen with --link-journal= or -j in parse_argv().
 * The "try-guest"/"try-host" spellings select LINK_GUEST/LINK_HOST and
 * additionally set arg_link_journal_try. */
103 typedef enum LinkJournal {
104         LINK_NO,        /* --link-journal=no */
105         LINK_AUTO,      /* --link-journal=auto; also the built-in default */
106         LINK_HOST,      /* --link-journal=host or try-host */
107         LINK_GUEST      /* --link-journal=guest or try-guest, and -j */
108 } LinkJournal;
109
/* Volatile mode, chosen with --volatile[=MODE] in parse_argv(). */
110 typedef enum Volatile {
111         VOLATILE_NO,    /* default, or MODE parsed as a false boolean */
112         VOLATILE_YES,   /* --volatile without MODE, or MODE parsed as a true boolean */
113         VOLATILE_STATE, /* --volatile=state */
114 } Volatile;
115
/* Command line state, filled in by parse_argv().
 *
 * Ownership: arg_directory, arg_user and arg_machine are heap copies that
 * parse_argv() frees before replacing. The "const char *" settings
 * (arg_selinux_context, arg_selinux_apifs_context, arg_slice,
 * arg_network_bridge, arg_image) point straight at optarg and are not
 * owned. The "char **" settings are string vectors grown with
 * strv_extend()/strv_push()/strv_env_set(). */

/* -D/--directory=: result of canonicalize_file_name() on the given path */
116 static char *arg_directory = NULL;
/* -u/--user= */
117 static char *arg_user = NULL;
/* --uuid=, parsed with sd_id128_from_string() */
118 static sd_id128_t arg_uuid = {};
/* -M/--machine=: validated with hostname_is_valid(); an empty argument resets it to NULL */
119 static char *arg_machine = NULL;
/* -Z/--selinux-context= */
120 static const char *arg_selinux_context = NULL;
/* -L/--selinux-apifs-context=: appended as context="..." to tmpfs/devpts mount options in mount_all() */
121 static const char *arg_selinux_apifs_context = NULL;
/* -S/--slice= */
122 static const char *arg_slice = NULL;
/* --private-network; also switched on by every --network-* option */
123 static bool arg_private_network = false;
/* --read-only; parse_argv() refuses to combine it with --volatile */
124 static bool arg_read_only = false;
/* -b/--boot; parse_argv() refuses to combine it with --share-system */
125 static bool arg_boot = false;
/* --link-journal= / -j */
126 static LinkJournal arg_link_journal = LINK_AUTO;
/* Set by -j and by the try-guest/try-host modes */
127 static bool arg_link_journal_try = false;
/* Capability bit mask (bit N stands for capability number N). This is the
 * default set; at the end of parse_argv() the --capability= bits and, when
 * a private network is requested, CAP_NET_ADMIN are OR-ed in, and the
 * --drop-capability= bits are then masked out. */
128 static uint64_t arg_retain =
129         (1ULL << CAP_CHOWN) |
130         (1ULL << CAP_DAC_OVERRIDE) |
131         (1ULL << CAP_DAC_READ_SEARCH) |
132         (1ULL << CAP_FOWNER) |
133         (1ULL << CAP_FSETID) |
134         (1ULL << CAP_IPC_OWNER) |
135         (1ULL << CAP_KILL) |
136         (1ULL << CAP_LEASE) |
137         (1ULL << CAP_LINUX_IMMUTABLE) |
138         (1ULL << CAP_NET_BIND_SERVICE) |
139         (1ULL << CAP_NET_BROADCAST) |
140         (1ULL << CAP_NET_RAW) |
141         (1ULL << CAP_SETGID) |
142         (1ULL << CAP_SETFCAP) |
143         (1ULL << CAP_SETPCAP) |
144         (1ULL << CAP_SETUID) |
145         (1ULL << CAP_SYS_ADMIN) |
146         (1ULL << CAP_SYS_CHROOT) |
147         (1ULL << CAP_SYS_NICE) |
148         (1ULL << CAP_SYS_PTRACE) |
149         (1ULL << CAP_SYS_TTY_CONFIG) |
150         (1ULL << CAP_SYS_RESOURCE) |
151         (1ULL << CAP_SYS_BOOT) |
152         (1ULL << CAP_AUDIT_WRITE) |
153         (1ULL << CAP_AUDIT_CONTROL) |
154         (1ULL << CAP_MKNOD);
/* --bind=: flat vector of (host path, container path) pairs */
155 static char **arg_bind = NULL;
/* --bind-ro=: same layout as arg_bind */
156 static char **arg_bind_ro = NULL;
/* --tmpfs=: flat vector of (path, mount options) pairs; options default to "mode=0755" */
157 static char **arg_tmpfs = NULL;
/* --setenv=: NAME=VALUE assignments, merged with strv_env_set() */
158 static char **arg_setenv = NULL;
/* -q/--quiet */
159 static bool arg_quiet = false;
/* --share-system; forces arg_register to false */
160 static bool arg_share_system = false;
/* --register=BOOLEAN */
161 static bool arg_register = true;
/* --keep-unit */
162 static bool arg_keep_unit = false;
/* --network-interface=, one entry per occurrence */
163 static char **arg_network_interfaces = NULL;
/* --network-macvlan=, one entry per occurrence */
164 static char **arg_network_macvlan = NULL;
/* --network-veth; also set by --network-bridge= */
165 static bool arg_network_veth = false;
/* --network-bridge= */
166 static const char *arg_network_bridge = NULL;
/* --personality=. 0xffffffffLU is the value parse_argv() rejects as a failed
 * personality_from_string() lookup; as the default it presumably means
 * "not set" -- NOTE(review): confirm against the code that consumes it. */
167 static unsigned long arg_personality = 0xffffffffLU;
/* -i/--image=; parse_argv() refuses to combine it with --directory= */
168 static const char *arg_image = NULL;
/* --volatile[=MODE] */
169 static Volatile arg_volatile = VOLATILE_NO;
170
171 static void help(void) {
172         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
173                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
174                "  -h --help                 Show this help\n"
175                "     --version              Print version string\n"
176                "  -q --quiet                Do not show status information\n"
177                "  -D --directory=PATH       Root directory for the container\n"
178                "  -i --image=PATH           File system device or image for the container\n"
179                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
180                "  -u --user=USER            Run the command under specified user or uid\n"
181                "  -M --machine=NAME         Set the machine name for the container\n"
182                "     --uuid=UUID            Set a specific machine UUID for the container\n"
183                "  -S --slice=SLICE          Place the container in the specified slice\n"
184                "     --private-network      Disable network in container\n"
185                "     --network-interface=INTERFACE\n"
186                "                            Assign an existing network interface to the\n"
187                "                            container\n"
188                "     --network-macvlan=INTERFACE\n"
189                "                            Create a macvlan network interface based on an\n"
190                "                            existing network interface to the container\n"
191                "     --network-veth         Add a virtual ethernet connection between host\n"
192                "                            and container\n"
193                "     --network-bridge=INTERFACE\n"
194                "                            Add a virtual ethernet connection between host\n"
195                "                            and container and add it to an existing bridge on\n"
196                "                            the host\n"
197                "  -Z --selinux-context=SECLABEL\n"
198                "                            Set the SELinux security context to be used by\n"
199                "                            processes in the container\n"
200                "  -L --selinux-apifs-context=SECLABEL\n"
201                "                            Set the SELinux security context to be used by\n"
202                "                            API/tmpfs file systems in the container\n"
203                "     --capability=CAP       In addition to the default, retain specified\n"
204                "                            capability\n"
205                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
206                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
207                "                            try-guest, try-host\n"
208                "  -j                        Equivalent to --link-journal=try-guest\n"
209                "     --read-only            Mount the root directory read-only\n"
210                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
211                "                            the container\n"
212                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
213                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
214                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
215                "     --share-system         Share system namespaces with host\n"
216                "     --register=BOOLEAN     Register container as machine\n"
217                "     --keep-unit            Do not register a scope for the machine, reuse\n"
218                "                            the service unit nspawn is running in\n"
219                "     --volatile[=MODE]      Run the system in volatile mode\n",
220                program_invocation_short_name);
221 }
222
223 static int parse_argv(int argc, char *argv[]) {
224
225         enum {
226                 ARG_VERSION = 0x100,
227                 ARG_PRIVATE_NETWORK,
228                 ARG_UUID,
229                 ARG_READ_ONLY,
230                 ARG_CAPABILITY,
231                 ARG_DROP_CAPABILITY,
232                 ARG_LINK_JOURNAL,
233                 ARG_BIND,
234                 ARG_BIND_RO,
235                 ARG_TMPFS,
236                 ARG_SETENV,
237                 ARG_SHARE_SYSTEM,
238                 ARG_REGISTER,
239                 ARG_KEEP_UNIT,
240                 ARG_NETWORK_INTERFACE,
241                 ARG_NETWORK_MACVLAN,
242                 ARG_NETWORK_VETH,
243                 ARG_NETWORK_BRIDGE,
244                 ARG_PERSONALITY,
245                 ARG_VOLATILE,
246         };
247
248         static const struct option options[] = {
249                 { "help",                  no_argument,       NULL, 'h'                   },
250                 { "version",               no_argument,       NULL, ARG_VERSION           },
251                 { "directory",             required_argument, NULL, 'D'                   },
252                 { "user",                  required_argument, NULL, 'u'                   },
253                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
254                 { "boot",                  no_argument,       NULL, 'b'                   },
255                 { "uuid",                  required_argument, NULL, ARG_UUID              },
256                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
257                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
258                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
259                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
260                 { "bind",                  required_argument, NULL, ARG_BIND              },
261                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
262                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
263                 { "machine",               required_argument, NULL, 'M'                   },
264                 { "slice",                 required_argument, NULL, 'S'                   },
265                 { "setenv",                required_argument, NULL, ARG_SETENV            },
266                 { "selinux-context",       required_argument, NULL, 'Z'                   },
267                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
268                 { "quiet",                 no_argument,       NULL, 'q'                   },
269                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
270                 { "register",              required_argument, NULL, ARG_REGISTER          },
271                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
272                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
273                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
274                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
275                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
276                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
277                 { "image",                 required_argument, NULL, 'i'                   },
278                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
279                 {}
280         };
281
282         int c, r;
283         uint64_t plus = 0, minus = 0;
284
285         assert(argc >= 0);
286         assert(argv);
287
288         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
289
290                 switch (c) {
291
292                 case 'h':
293                         help();
294                         return 0;
295
296                 case ARG_VERSION:
297                         puts(PACKAGE_STRING);
298                         puts(SYSTEMD_FEATURES);
299                         return 0;
300
301                 case 'D':
302                         free(arg_directory);
303                         arg_directory = canonicalize_file_name(optarg);
304                         if (!arg_directory) {
305                                 log_error("Invalid root directory: %m");
306                                 return -ENOMEM;
307                         }
308
309                         break;
310
311                 case 'i':
312                         arg_image = optarg;
313                         break;
314
315                 case 'u':
316                         free(arg_user);
317                         arg_user = strdup(optarg);
318                         if (!arg_user)
319                                 return log_oom();
320
321                         break;
322
323                 case ARG_NETWORK_BRIDGE:
324                         arg_network_bridge = optarg;
325
326                         /* fall through */
327
328                 case ARG_NETWORK_VETH:
329                         arg_network_veth = true;
330                         arg_private_network = true;
331                         break;
332
333                 case ARG_NETWORK_INTERFACE:
334                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
335                                 return log_oom();
336
337                         arg_private_network = true;
338                         break;
339
340                 case ARG_NETWORK_MACVLAN:
341                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
342                                 return log_oom();
343
344                         /* fall through */
345
346                 case ARG_PRIVATE_NETWORK:
347                         arg_private_network = true;
348                         break;
349
350                 case 'b':
351                         arg_boot = true;
352                         break;
353
354                 case ARG_UUID:
355                         r = sd_id128_from_string(optarg, &arg_uuid);
356                         if (r < 0) {
357                                 log_error("Invalid UUID: %s", optarg);
358                                 return r;
359                         }
360                         break;
361
362                 case 'S':
363                         arg_slice = optarg;
364                         break;
365
366                 case 'M':
367                         if (isempty(optarg)) {
368                                 free(arg_machine);
369                                 arg_machine = NULL;
370                         } else {
371
372                                 if (!hostname_is_valid(optarg)) {
373                                         log_error("Invalid machine name: %s", optarg);
374                                         return -EINVAL;
375                                 }
376
377                                 free(arg_machine);
378                                 arg_machine = strdup(optarg);
379                                 if (!arg_machine)
380                                         return log_oom();
381
382                                 break;
383                         }
384
385                 case 'Z':
386                         arg_selinux_context = optarg;
387                         break;
388
389                 case 'L':
390                         arg_selinux_apifs_context = optarg;
391                         break;
392
393                 case ARG_READ_ONLY:
394                         arg_read_only = true;
395                         break;
396
397                 case ARG_CAPABILITY:
398                 case ARG_DROP_CAPABILITY: {
399                         const char *state, *word;
400                         size_t length;
401
402                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
403                                 _cleanup_free_ char *t;
404                                 cap_value_t cap;
405
406                                 t = strndup(word, length);
407                                 if (!t)
408                                         return log_oom();
409
410                                 if (streq(t, "all")) {
411                                         if (c == ARG_CAPABILITY)
412                                                 plus = (uint64_t) -1;
413                                         else
414                                                 minus = (uint64_t) -1;
415                                 } else {
416                                         if (cap_from_name(t, &cap) < 0) {
417                                                 log_error("Failed to parse capability %s.", t);
418                                                 return -EINVAL;
419                                         }
420
421                                         if (c == ARG_CAPABILITY)
422                                                 plus |= 1ULL << (uint64_t) cap;
423                                         else
424                                                 minus |= 1ULL << (uint64_t) cap;
425                                 }
426                         }
427
428                         break;
429                 }
430
431                 case 'j':
432                         arg_link_journal = LINK_GUEST;
433                         arg_link_journal_try = true;
434                         break;
435
436                 case ARG_LINK_JOURNAL:
437                         if (streq(optarg, "auto"))
438                                 arg_link_journal = LINK_AUTO;
439                         else if (streq(optarg, "no"))
440                                 arg_link_journal = LINK_NO;
441                         else if (streq(optarg, "guest"))
442                                 arg_link_journal = LINK_GUEST;
443                         else if (streq(optarg, "host"))
444                                 arg_link_journal = LINK_HOST;
445                         else if (streq(optarg, "try-guest")) {
446                                 arg_link_journal = LINK_GUEST;
447                                 arg_link_journal_try = true;
448                         } else if (streq(optarg, "try-host")) {
449                                 arg_link_journal = LINK_HOST;
450                                 arg_link_journal_try = true;
451                         } else {
452                                 log_error("Failed to parse link journal mode %s", optarg);
453                                 return -EINVAL;
454                         }
455
456                         break;
457
458                 case ARG_BIND:
459                 case ARG_BIND_RO: {
460                         _cleanup_free_ char *a = NULL, *b = NULL;
461                         char *e;
462                         char ***x;
463
464                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
465
466                         e = strchr(optarg, ':');
467                         if (e) {
468                                 a = strndup(optarg, e - optarg);
469                                 b = strdup(e + 1);
470                         } else {
471                                 a = strdup(optarg);
472                                 b = strdup(optarg);
473                         }
474
475                         if (!a || !b)
476                                 return log_oom();
477
478                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
479                                 log_error("Invalid bind mount specification: %s", optarg);
480                                 return -EINVAL;
481                         }
482
483                         r = strv_extend(x, a);
484                         if (r < 0)
485                                 return log_oom();
486
487                         r = strv_extend(x, b);
488                         if (r < 0)
489                                 return log_oom();
490
491                         break;
492                 }
493
494                 case ARG_TMPFS: {
495                         _cleanup_free_ char *a = NULL, *b = NULL;
496                         char *e;
497
498                         e = strchr(optarg, ':');
499                         if (e) {
500                                 a = strndup(optarg, e - optarg);
501                                 b = strdup(e + 1);
502                         } else {
503                                 a = strdup(optarg);
504                                 b = strdup("mode=0755");
505                         }
506
507                         if (!a || !b)
508                                 return log_oom();
509
510                         if (!path_is_absolute(a)) {
511                                 log_error("Invalid tmpfs specification: %s", optarg);
512                                 return -EINVAL;
513                         }
514
515                         r = strv_push(&arg_tmpfs, a);
516                         if (r < 0)
517                                 return log_oom();
518
519                         a = NULL;
520
521                         r = strv_push(&arg_tmpfs, b);
522                         if (r < 0)
523                                 return log_oom();
524
525                         b = NULL;
526
527                         break;
528                 }
529
530                 case ARG_SETENV: {
531                         char **n;
532
533                         if (!env_assignment_is_valid(optarg)) {
534                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
535                                 return -EINVAL;
536                         }
537
538                         n = strv_env_set(arg_setenv, optarg);
539                         if (!n)
540                                 return log_oom();
541
542                         strv_free(arg_setenv);
543                         arg_setenv = n;
544                         break;
545                 }
546
547                 case 'q':
548                         arg_quiet = true;
549                         break;
550
551                 case ARG_SHARE_SYSTEM:
552                         arg_share_system = true;
553                         break;
554
555                 case ARG_REGISTER:
556                         r = parse_boolean(optarg);
557                         if (r < 0) {
558                                 log_error("Failed to parse --register= argument: %s", optarg);
559                                 return r;
560                         }
561
562                         arg_register = r;
563                         break;
564
565                 case ARG_KEEP_UNIT:
566                         arg_keep_unit = true;
567                         break;
568
569                 case ARG_PERSONALITY:
570
571                         arg_personality = personality_from_string(optarg);
572                         if (arg_personality == 0xffffffffLU) {
573                                 log_error("Unknown or unsupported personality '%s'.", optarg);
574                                 return -EINVAL;
575                         }
576
577                         break;
578
579                 case ARG_VOLATILE:
580
581                         if (!optarg)
582                                 arg_volatile = VOLATILE_YES;
583                         else {
584                                 r = parse_boolean(optarg);
585                                 if (r < 0) {
586                                         if (streq(optarg, "state"))
587                                                 arg_volatile = VOLATILE_STATE;
588                                         else {
589                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
590                                                 return r;
591                                         }
592                                 } else
593                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
594                         }
595
596                         break;
597
598                 case '?':
599                         return -EINVAL;
600
601                 default:
602                         assert_not_reached("Unhandled option");
603                 }
604
605         if (arg_share_system)
606                 arg_register = false;
607
608         if (arg_boot && arg_share_system) {
609                 log_error("--boot and --share-system may not be combined.");
610                 return -EINVAL;
611         }
612
613         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
614                 log_error("--keep-unit may not be used when invoked from a user session.");
615                 return -EINVAL;
616         }
617
618         if (arg_directory && arg_image) {
619                 log_error("--directory= and --image= may not be combined.");
620                 return -EINVAL;
621         }
622
623         if (arg_volatile != VOLATILE_NO && arg_read_only) {
624                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
625                 return -EINVAL;
626         }
627
628         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
629
630         return 1;
631 }
632
633 static int mount_all(const char *dest) {
634
635         typedef struct MountPoint {
636                 const char *what;
637                 const char *where;
638                 const char *type;
639                 const char *options;
640                 unsigned long flags;
641                 bool fatal;
642         } MountPoint;
643
644         static const MountPoint mount_table[] = {
645                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
646                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
647                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
648                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
649                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
650                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
651                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
652                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
653 #ifdef HAVE_SELINUX
654                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
655                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
656 #endif
657         };
658
659         unsigned k;
660         int r = 0;
661
662         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
663                 _cleanup_free_ char *where = NULL;
664 #ifdef HAVE_SELINUX
665                 _cleanup_free_ char *options = NULL;
666 #endif
667                 const char *o;
668                 int t;
669
670                 where = strjoin(dest, "/", mount_table[k].where, NULL);
671                 if (!where)
672                         return log_oom();
673
674                 t = path_is_mount_point(where, true);
675                 if (t < 0) {
676                         log_error_errno(-t, "Failed to detect whether %s is a mount point: %m", where);
677
678                         if (r == 0)
679                                 r = t;
680
681                         continue;
682                 }
683
684                 /* Skip this entry if it is not a remount. */
685                 if (mount_table[k].what && t > 0)
686                         continue;
687
688                 t = mkdir_p(where, 0755);
689                 if (t < 0) {
690                         if (mount_table[k].fatal) {
691                                log_error_errno(-t, "Failed to create directory %s: %m", where);
692
693                                 if (r == 0)
694                                         r = t;
695                         } else
696                                log_warning_errno(-t, "Failed to create directory %s: %m", where);
697
698                         continue;
699                 }
700
701 #ifdef HAVE_SELINUX
702                 if (arg_selinux_apifs_context &&
703                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
704                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
705                         if (!options)
706                                 return log_oom();
707
708                         o = options;
709                 } else
710 #endif
711                         o = mount_table[k].options;
712
713
714                 if (mount(mount_table[k].what,
715                           where,
716                           mount_table[k].type,
717                           mount_table[k].flags,
718                           o) < 0) {
719
720                         if (mount_table[k].fatal) {
721                                 log_error("mount(%s) failed: %m", where);
722
723                                 if (r == 0)
724                                         r = -errno;
725                         } else
726                                 log_warning("mount(%s) failed: %m", where);
727                 }
728         }
729
730         return r;
731 }
732
733 static int mount_binds(const char *dest, char **l, bool ro) {
734         char **x, **y;
735
736         STRV_FOREACH_PAIR(x, y, l) {
737                 _cleanup_free_ char *where = NULL;
738                 struct stat source_st, dest_st;
739                 int r;
740
741                 if (stat(*x, &source_st) < 0) {
742                         log_error("Failed to stat %s: %m", *x);
743                         return -errno;
744                 }
745
746                 where = strappend(dest, *y);
747                 if (!where)
748                         return log_oom();
749
750                 r = stat(where, &dest_st);
751                 if (r == 0) {
752                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
753                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
754                                 return -EINVAL;
755                         }
756                 } else if (errno == ENOENT) {
757                         r = mkdir_parents_label(where, 0755);
758                         if (r < 0) {
759                                 log_error_errno(-r, "Failed to bind mount %s: %m", *x);
760                                 return r;
761                         }
762                 } else {
763                         log_error("Failed to bind mount %s: %m", *x);
764                         return -errno;
765                 }
766
767                 /* Create the mount point, but be conservative -- refuse to create block
768                  * and char devices. */
769                 if (S_ISDIR(source_st.st_mode)) {
770                         r = mkdir_label(where, 0755);
771                         if (r < 0 && errno != EEXIST) {
772                                 log_error_errno(-r, "Failed to create mount point %s: %m", where);
773
774                                 return r;
775                         }
776                 } else if (S_ISFIFO(source_st.st_mode)) {
777                         r = mkfifo(where, 0644);
778                         if (r < 0 && errno != EEXIST) {
779                                 log_error("Failed to create mount point %s: %m", where);
780
781                                 return -errno;
782                         }
783                 } else if (S_ISSOCK(source_st.st_mode)) {
784                         r = mknod(where, 0644 | S_IFSOCK, 0);
785                         if (r < 0 && errno != EEXIST) {
786                                 log_error("Failed to create mount point %s: %m", where);
787
788                                 return -errno;
789                         }
790                 } else if (S_ISREG(source_st.st_mode)) {
791                         r = touch(where);
792                         if (r < 0) {
793                                 log_error_errno(-r, "Failed to create mount point %s: %m", where);
794
795                                 return r;
796                         }
797                 } else {
798                         log_error("Refusing to create mountpoint for file: %s", *x);
799                         return -ENOTSUP;
800                 }
801
802                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
803                         log_error("mount(%s) failed: %m", where);
804                         return -errno;
805                 }
806
807                 if (ro) {
808                         r = bind_remount_recursive(where, true);
809                         if (r < 0) {
810                                 log_error_errno(-r, "Read-Only bind mount failed: %m");
811                                 return r;
812                         }
813                 }
814         }
815
816         return 0;
817 }
818
819 static int mount_tmpfs(const char *dest) {
820         char **i, **o;
821
822         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
823                 _cleanup_free_ char *where = NULL;
824                 int r;
825
826                 where = strappend(dest, *i);
827                 if (!where)
828                         return log_oom();
829
830                 r = mkdir_label(where, 0755);
831                 if (r < 0 && errno != EEXIST) {
832                         log_error_errno(-r, "creating mount point for tmpfs %s failed: %m", where);
833
834                         return r;
835                 }
836
837                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
838                         log_error("tmpfs mount to %s failed: %m", where);
839                         return -errno;
840                 }
841         }
842
843         return 0;
844 }
845
846 static int setup_timezone(const char *dest) {
847         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
848         char *z, *y;
849         int r;
850
851         assert(dest);
852
853         /* Fix the timezone, if possible */
854         r = readlink_malloc("/etc/localtime", &p);
855         if (r < 0) {
856                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
857                 return 0;
858         }
859
860         z = path_startswith(p, "../usr/share/zoneinfo/");
861         if (!z)
862                 z = path_startswith(p, "/usr/share/zoneinfo/");
863         if (!z) {
864                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
865                 return 0;
866         }
867
868         where = strappend(dest, "/etc/localtime");
869         if (!where)
870                 return log_oom();
871
872         r = readlink_malloc(where, &q);
873         if (r >= 0) {
874                 y = path_startswith(q, "../usr/share/zoneinfo/");
875                 if (!y)
876                         y = path_startswith(q, "/usr/share/zoneinfo/");
877
878                 /* Already pointing to the right place? Then do nothing .. */
879                 if (y && streq(y, z))
880                         return 0;
881         }
882
883         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
884         if (!check)
885                 return log_oom();
886
887         if (access(check, F_OK) < 0) {
888                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
889                 return 0;
890         }
891
892         what = strappend("../usr/share/zoneinfo/", z);
893         if (!what)
894                 return log_oom();
895
896         r = mkdir_parents(where, 0755);
897         if (r < 0) {
898                 log_error_errno(-r, "Failed to create directory for timezone info %s in container: %m", where);
899
900                 return 0;
901         }
902
903         r = unlink(where);
904         if (r < 0 && errno != ENOENT) {
905                 log_error("Failed to remove existing timezone info %s in container: %m", where);
906
907                 return 0;
908         }
909
910         if (symlink(what, where) < 0) {
911                 log_error("Failed to correct timezone of container: %m");
912                 return 0;
913         }
914
915         return 0;
916 }
917
918 static int setup_resolv_conf(const char *dest) {
919         _cleanup_free_ char *where = NULL;
920         int r;
921
922         assert(dest);
923
924         if (arg_private_network)
925                 return 0;
926
927         /* Fix resolv.conf, if possible */
928         where = strappend(dest, "/etc/resolv.conf");
929         if (!where)
930                 return log_oom();
931
932         /* We don't really care for the results of this really. If it
933          * fails, it fails, but meh... */
934         r = mkdir_parents(where, 0755);
935         if (r < 0) {
936                 log_warning_errno(-r, "Failed to create parent directory for resolv.conf %s: %m", where);
937
938                 return 0;
939         }
940
941         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
942         if (r < 0) {
943                 log_warning_errno(-r, "Failed to copy /etc/resolv.conf to %s: %m", where);
944
945                 return 0;
946         }
947
948         return 0;
949 }
950
951 static int setup_volatile_state(const char *directory) {
952         const char *p;
953         int r;
954
955         assert(directory);
956
957         if (arg_volatile != VOLATILE_STATE)
958                 return 0;
959
960         /* --volatile=state means we simply overmount /var
961            with a tmpfs, and the rest read-only. */
962
963         r = bind_remount_recursive(directory, true);
964         if (r < 0) {
965                 log_error_errno(-r, "Failed to remount %s read-only: %m", directory);
966                 return r;
967         }
968
969         p = strappenda(directory, "/var");
970         r = mkdir(p, 0755);
971         if (r < 0 && errno != EEXIST) {
972                 log_error("Failed to create %s: %m", directory);
973                 return -errno;
974         }
975
976         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
977                 log_error("Failed to mount tmpfs to /var: %m");
978                 return -errno;
979         }
980
981         return 0;
982 }
983
984 static int setup_volatile(const char *directory) {
985         bool tmpfs_mounted = false, bind_mounted = false;
986         char template[] = "/tmp/nspawn-volatile-XXXXXX";
987         const char *f, *t;
988         int r;
989
990         assert(directory);
991
992         if (arg_volatile != VOLATILE_YES)
993                 return 0;
994
995         /* --volatile=yes means we mount a tmpfs to the root dir, and
996            the original /usr to use inside it, and that read-only. */
997
998         if (!mkdtemp(template)) {
999                 log_error("Failed to create temporary directory: %m");
1000                 return -errno;
1001         }
1002
1003         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1004                 log_error("Failed to mount tmpfs for root directory: %m");
1005                 r = -errno;
1006                 goto fail;
1007         }
1008
1009         tmpfs_mounted = true;
1010
1011         f = strappenda(directory, "/usr");
1012         t = strappenda(template, "/usr");
1013
1014         r = mkdir(t, 0755);
1015         if (r < 0 && errno != EEXIST) {
1016                 log_error("Failed to create %s: %m", t);
1017                 r = -errno;
1018                 goto fail;
1019         }
1020
1021         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1022                 log_error("Failed to create /usr bind mount: %m");
1023                 r = -errno;
1024                 goto fail;
1025         }
1026
1027         bind_mounted = true;
1028
1029         r = bind_remount_recursive(t, true);
1030         if (r < 0) {
1031                 log_error_errno(-r, "Failed to remount %s read-only: %m", t);
1032                 goto fail;
1033         }
1034
1035         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1036                 log_error("Failed to move root mount: %m");
1037                 r = -errno;
1038                 goto fail;
1039         }
1040
1041         rmdir(template);
1042
1043         return 0;
1044
1045 fail:
1046         if (bind_mounted)
1047                 umount(t);
1048         if (tmpfs_mounted)
1049                 umount(template);
1050         rmdir(template);
1051         return r;
1052 }
1053
1054 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1055
1056         snprintf(s, 37,
1057                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1058                  SD_ID128_FORMAT_VAL(id));
1059
1060         return s;
1061 }
1062
1063 static int setup_boot_id(const char *dest) {
1064         _cleanup_free_ char *from = NULL, *to = NULL;
1065         sd_id128_t rnd = {};
1066         char as_uuid[37];
1067         int r;
1068
1069         assert(dest);
1070
1071         if (arg_share_system)
1072                 return 0;
1073
1074         /* Generate a new randomized boot ID, so that each boot-up of
1075          * the container gets a new one */
1076
1077         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1078         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1079         if (!from || !to)
1080                 return log_oom();
1081
1082         r = sd_id128_randomize(&rnd);
1083         if (r < 0) {
1084                 log_error_errno(-r, "Failed to generate random boot id: %m");
1085                 return r;
1086         }
1087
1088         id128_format_as_uuid(rnd, as_uuid);
1089
1090         r = write_string_file(from, as_uuid);
1091         if (r < 0) {
1092                 log_error_errno(-r, "Failed to write boot id: %m");
1093                 return r;
1094         }
1095
1096         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1097                 log_error("Failed to bind mount boot id: %m");
1098                 r = -errno;
1099         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1100                 log_warning("Failed to make boot id read-only: %m");
1101
1102         unlink(from);
1103         return r;
1104 }
1105
1106 static int copy_devnodes(const char *dest) {
1107
1108         static const char devnodes[] =
1109                 "null\0"
1110                 "zero\0"
1111                 "full\0"
1112                 "random\0"
1113                 "urandom\0"
1114                 "tty\0"
1115                 "net/tun\0";
1116
1117         const char *d;
1118         int r = 0;
1119         _cleanup_umask_ mode_t u;
1120
1121         assert(dest);
1122
1123         u = umask(0000);
1124
1125         NULSTR_FOREACH(d, devnodes) {
1126                 _cleanup_free_ char *from = NULL, *to = NULL;
1127                 struct stat st;
1128
1129                 from = strappend("/dev/", d);
1130                 to = strjoin(dest, "/dev/", d, NULL);
1131                 if (!from || !to)
1132                         return log_oom();
1133
1134                 if (stat(from, &st) < 0) {
1135
1136                         if (errno != ENOENT) {
1137                                 log_error("Failed to stat %s: %m", from);
1138                                 return -errno;
1139                         }
1140
1141                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1142
1143                         log_error("%s is not a char or block device, cannot copy", from);
1144                         return -EIO;
1145
1146                 } else {
1147                         r = mkdir_parents(to, 0775);
1148                         if (r < 0) {
1149                                 log_error_errno(-r, "Failed to create parent directory of %s: %m", to);
1150                                 return -r;
1151                         }
1152
1153                         if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1154                                 log_error("mknod(%s) failed: %m", dest);
1155                                 return  -errno;
1156                         }
1157                 }
1158         }
1159
1160         return r;
1161 }
1162
1163 static int setup_ptmx(const char *dest) {
1164         _cleanup_free_ char *p = NULL;
1165
1166         p = strappend(dest, "/dev/ptmx");
1167         if (!p)
1168                 return log_oom();
1169
1170         if (symlink("pts/ptmx", p) < 0) {
1171                 log_error("Failed to create /dev/ptmx symlink: %m");
1172                 return -errno;
1173         }
1174
1175         return 0;
1176 }
1177
1178 static int setup_dev_console(const char *dest, const char *console) {
1179         _cleanup_umask_ mode_t u;
1180         const char *to;
1181         struct stat st;
1182         int r;
1183
1184         assert(dest);
1185         assert(console);
1186
1187         u = umask(0000);
1188
1189         if (stat("/dev/null", &st) < 0) {
1190                 log_error("Failed to stat /dev/null: %m");
1191                 return -errno;
1192         }
1193
1194         r = chmod_and_chown(console, 0600, 0, 0);
1195         if (r < 0) {
1196                 log_error_errno(-r, "Failed to correct access mode for TTY: %m");
1197                 return r;
1198         }
1199
1200         /* We need to bind mount the right tty to /dev/console since
1201          * ptys can only exist on pts file systems. To have something
1202          * to bind mount things on we create a device node first, and
1203          * use /dev/null for that since we the cgroups device policy
1204          * allows us to create that freely, while we cannot create
1205          * /dev/console. (Note that the major minor doesn't actually
1206          * matter here, since we mount it over anyway). */
1207
1208         to = strappenda(dest, "/dev/console");
1209         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1210                 log_error("mknod() for /dev/console failed: %m");
1211                 return -errno;
1212         }
1213
1214         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1215                 log_error("Bind mount for /dev/console failed: %m");
1216                 return -errno;
1217         }
1218
1219         return 0;
1220 }
1221
1222 static int setup_kmsg(const char *dest, int kmsg_socket) {
1223         _cleanup_free_ char *from = NULL, *to = NULL;
1224         int r, fd, k;
1225         _cleanup_umask_ mode_t u;
1226         union {
1227                 struct cmsghdr cmsghdr;
1228                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1229         } control = {};
1230         struct msghdr mh = {
1231                 .msg_control = &control,
1232                 .msg_controllen = sizeof(control),
1233         };
1234         struct cmsghdr *cmsg;
1235
1236         assert(dest);
1237         assert(kmsg_socket >= 0);
1238
1239         u = umask(0000);
1240
1241         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1242          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1243          * on the reading side behave very similar to /proc/kmsg,
1244          * their writing side behaves differently from /dev/kmsg in
1245          * that writing blocks when nothing is reading. In order to
1246          * avoid any problems with containers deadlocking due to this
1247          * we simply make /dev/kmsg unavailable to the container. */
1248         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1249             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1250                 return log_oom();
1251
1252         if (mkfifo(from, 0600) < 0) {
1253                 log_error("mkfifo() for /dev/kmsg failed: %m");
1254                 return -errno;
1255         }
1256
1257         r = chmod_and_chown(from, 0600, 0, 0);
1258         if (r < 0) {
1259                 log_error_errno(-r, "Failed to correct access mode for /dev/kmsg: %m");
1260                 return r;
1261         }
1262
1263         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1264                 log_error("Bind mount for /proc/kmsg failed: %m");
1265                 return -errno;
1266         }
1267
1268         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1269         if (fd < 0) {
1270                 log_error("Failed to open fifo: %m");
1271                 return -errno;
1272         }
1273
1274         cmsg = CMSG_FIRSTHDR(&mh);
1275         cmsg->cmsg_level = SOL_SOCKET;
1276         cmsg->cmsg_type = SCM_RIGHTS;
1277         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1278         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1279
1280         mh.msg_controllen = cmsg->cmsg_len;
1281
1282         /* Store away the fd in the socket, so that it stays open as
1283          * long as we run the child */
1284         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1285         safe_close(fd);
1286
1287         if (k < 0) {
1288                 log_error("Failed to send FIFO fd: %m");
1289                 return -errno;
1290         }
1291
1292         /* And now make the FIFO unavailable as /dev/kmsg... */
1293         unlink(from);
1294         return 0;
1295 }
1296
1297 static int setup_hostname(void) {
1298
1299         if (arg_share_system)
1300                 return 0;
1301
1302         if (sethostname_idempotent(arg_machine) < 0)
1303                 return -errno;
1304
1305         return 0;
1306 }
1307
1308 static int setup_journal(const char *directory) {
1309         sd_id128_t machine_id, this_id;
1310         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1311         char *id;
1312         int r;
1313
1314         p = strappend(directory, "/etc/machine-id");
1315         if (!p)
1316                 return log_oom();
1317
1318         r = read_one_line_file(p, &b);
1319         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1320                 return 0;
1321         else if (r < 0) {
1322                 log_error_errno(-r, "Failed to read machine ID from %s: %m", p);
1323                 return r;
1324         }
1325
1326         id = strstrip(b);
1327         if (isempty(id) && arg_link_journal == LINK_AUTO)
1328                 return 0;
1329
1330         /* Verify validity */
1331         r = sd_id128_from_string(id, &machine_id);
1332         if (r < 0) {
1333                 log_error_errno(-r, "Failed to parse machine ID from %s: %m", p);
1334                 return r;
1335         }
1336
1337         r = sd_id128_get_machine(&this_id);
1338         if (r < 0) {
1339                 log_error_errno(-r, "Failed to retrieve machine ID: %m");
1340                 return r;
1341         }
1342
1343         if (sd_id128_equal(machine_id, this_id)) {
1344                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1345                          "Host and machine ids are equal (%s): refusing to link journals", id);
1346                 if (arg_link_journal == LINK_AUTO)
1347                         return 0;
1348                 return
1349                         -EEXIST;
1350         }
1351
1352         if (arg_link_journal == LINK_NO)
1353                 return 0;
1354
1355         free(p);
1356         p = strappend("/var/log/journal/", id);
1357         q = strjoin(directory, "/var/log/journal/", id, NULL);
1358         if (!p || !q)
1359                 return log_oom();
1360
1361         if (path_is_mount_point(p, false) > 0) {
1362                 if (arg_link_journal != LINK_AUTO) {
1363                         log_error("%s: already a mount point, refusing to use for journal", p);
1364                         return -EEXIST;
1365                 }
1366
1367                 return 0;
1368         }
1369
1370         if (path_is_mount_point(q, false) > 0) {
1371                 if (arg_link_journal != LINK_AUTO) {
1372                         log_error("%s: already a mount point, refusing to use for journal", q);
1373                         return -EEXIST;
1374                 }
1375
1376                 return 0;
1377         }
1378
1379         r = readlink_and_make_absolute(p, &d);
1380         if (r >= 0) {
1381                 if ((arg_link_journal == LINK_GUEST ||
1382                      arg_link_journal == LINK_AUTO) &&
1383                     path_equal(d, q)) {
1384
1385                         r = mkdir_p(q, 0755);
1386                         if (r < 0)
1387                                 log_warning("Failed to create directory %s: %m", q);
1388                         return 0;
1389                 }
1390
1391                 if (unlink(p) < 0) {
1392                         log_error("Failed to remove symlink %s: %m", p);
1393                         return -errno;
1394                 }
1395         } else if (r == -EINVAL) {
1396
1397                 if (arg_link_journal == LINK_GUEST &&
1398                     rmdir(p) < 0) {
1399
1400                         if (errno == ENOTDIR) {
1401                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1402                                 return r;
1403                         } else {
1404                                 log_error("Failed to remove %s: %m", p);
1405                                 return -errno;
1406                         }
1407                 }
1408         } else if (r != -ENOENT) {
1409                 log_error("readlink(%s) failed: %m", p);
1410                 return r;
1411         }
1412
1413         if (arg_link_journal == LINK_GUEST) {
1414
1415                 if (symlink(q, p) < 0) {
1416                         if (arg_link_journal_try) {
1417                                 log_debug("Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1418                                 return 0;
1419                         } else {
1420                                 log_error("Failed to symlink %s to %s: %m", q, p);
1421                                 return -errno;
1422                         }
1423                 }
1424
1425                 r = mkdir_p(q, 0755);
1426                 if (r < 0)
1427                         log_warning("Failed to create directory %s: %m", q);
1428                 return 0;
1429         }
1430
1431         if (arg_link_journal == LINK_HOST) {
1432                 /* don't create parents here -- if the host doesn't have
1433                  * permanent journal set up, don't force it here */
1434                 r = mkdir(p, 0755);
1435                 if (r < 0) {
1436                         if (arg_link_journal_try) {
1437                                 log_debug("Failed to create %s, skipping journal setup: %m", p);
1438                                 return 0;
1439                         } else {
1440                                 log_error("Failed to create %s: %m", p);
1441                                 return r;
1442                         }
1443                 }
1444
1445         } else if (access(p, F_OK) < 0)
1446                 return 0;
1447
1448         if (dir_is_empty(q) == 0)
1449                 log_warning("%s is not empty, proceeding anyway.", q);
1450
1451         r = mkdir_p(q, 0755);
1452         if (r < 0) {
1453                 log_error("Failed to create %s: %m", q);
1454                 return r;
1455         }
1456
1457         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1458                 log_error("Failed to bind mount journal from host into guest: %m");
1459                 return -errno;
1460         }
1461
1462         return 0;
1463 }
1464
1465 static int drop_capabilities(void) {
1466         return capability_bounding_set_drop(~arg_retain, false);
1467 }
1468
1469 static int register_machine(pid_t pid, int local_ifindex) {
1470         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1471         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1472         int r;
1473
1474         if (!arg_register)
1475                 return 0;
1476
1477         r = sd_bus_default_system(&bus);
1478         if (r < 0) {
1479                 log_error_errno(-r, "Failed to open system bus: %m");
1480                 return r;
1481         }
1482
1483         if (arg_keep_unit) {
1484                 r = sd_bus_call_method(
1485                                 bus,
1486                                 "org.freedesktop.machine1",
1487                                 "/org/freedesktop/machine1",
1488                                 "org.freedesktop.machine1.Manager",
1489                                 "RegisterMachineWithNetwork",
1490                                 &error,
1491                                 NULL,
1492                                 "sayssusai",
1493                                 arg_machine,
1494                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1495                                 "nspawn",
1496                                 "container",
1497                                 (uint32_t) pid,
1498                                 strempty(arg_directory),
1499                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1500         } else {
1501                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1502
1503                 r = sd_bus_message_new_method_call(
1504                                 bus,
1505                                 &m,
1506                                 "org.freedesktop.machine1",
1507                                 "/org/freedesktop/machine1",
1508                                 "org.freedesktop.machine1.Manager",
1509                                 "CreateMachineWithNetwork");
1510                 if (r < 0) {
1511                         log_error_errno(-r, "Failed to create message: %m");
1512                         return r;
1513                 }
1514
1515                 r = sd_bus_message_append(
1516                                 m,
1517                                 "sayssusai",
1518                                 arg_machine,
1519                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1520                                 "nspawn",
1521                                 "container",
1522                                 (uint32_t) pid,
1523                                 strempty(arg_directory),
1524                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1525                 if (r < 0) {
1526                         log_error_errno(-r, "Failed to append message arguments: %m");
1527                         return r;
1528                 }
1529
1530                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1531                 if (r < 0) {
1532                         log_error_errno(-r, "Failed to open container: %m");
1533                         return r;
1534                 }
1535
1536                 if (!isempty(arg_slice)) {
1537                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1538                         if (r < 0) {
1539                                 log_error_errno(-r, "Failed to append slice: %m");
1540                                 return r;
1541                         }
1542                 }
1543
1544                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1545                 if (r < 0) {
1546                         log_error_errno(-r, "Failed to add device policy: %m");
1547                         return r;
1548                 }
1549
1550                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1551                                           /* Allow the container to
1552                                            * access and create the API
1553                                            * device nodes, so that
1554                                            * PrivateDevices= in the
1555                                            * container can work
1556                                            * fine */
1557                                           "/dev/null", "rwm",
1558                                           "/dev/zero", "rwm",
1559                                           "/dev/full", "rwm",
1560                                           "/dev/random", "rwm",
1561                                           "/dev/urandom", "rwm",
1562                                           "/dev/tty", "rwm",
1563                                           "/dev/net/tun", "rwm",
1564                                           /* Allow the container
1565                                            * access to ptys. However,
1566                                            * do not permit the
1567                                            * container to ever create
1568                                            * these device nodes. */
1569                                           "/dev/pts/ptmx", "rw",
1570                                           "char-pts", "rw");
1571                 if (r < 0) {
1572                         log_error_errno(-r, "Failed to add device whitelist: %m");
1573                         return r;
1574                 }
1575
1576                 r = sd_bus_message_close_container(m);
1577                 if (r < 0) {
1578                         log_error_errno(-r, "Failed to close container: %m");
1579                         return r;
1580                 }
1581
1582                 r = sd_bus_call(bus, m, 0, &error, NULL);
1583         }
1584
1585         if (r < 0) {
1586                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1587                 return r;
1588         }
1589
1590         return 0;
1591 }
1592
1593 static int terminate_machine(pid_t pid) {
1594         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1595         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1596         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1597         const char *path;
1598         int r;
1599
1600         if (!arg_register)
1601                 return 0;
1602
1603         r = sd_bus_default_system(&bus);
1604         if (r < 0) {
1605                 log_error_errno(-r, "Failed to open system bus: %m");
1606                 return r;
1607         }
1608
1609         r = sd_bus_call_method(
1610                         bus,
1611                         "org.freedesktop.machine1",
1612                         "/org/freedesktop/machine1",
1613                         "org.freedesktop.machine1.Manager",
1614                         "GetMachineByPID",
1615                         &error,
1616                         &reply,
1617                         "u",
1618                         (uint32_t) pid);
1619         if (r < 0) {
1620                 /* Note that the machine might already have been
1621                  * cleaned up automatically, hence don't consider it a
1622                  * failure if we cannot get the machine object. */
1623                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1624                 return 0;
1625         }
1626
1627         r = sd_bus_message_read(reply, "o", &path);
1628         if (r < 0)
1629                 return bus_log_parse_error(r);
1630
1631         r = sd_bus_call_method(
1632                         bus,
1633                         "org.freedesktop.machine1",
1634                         path,
1635                         "org.freedesktop.machine1.Machine",
1636                         "Terminate",
1637                         &error,
1638                         NULL,
1639                         NULL);
1640         if (r < 0) {
1641                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1642                 return 0;
1643         }
1644
1645         return 0;
1646 }
1647
1648 static int reset_audit_loginuid(void) {
1649         _cleanup_free_ char *p = NULL;
1650         int r;
1651
1652         if (arg_share_system)
1653                 return 0;
1654
1655         r = read_one_line_file("/proc/self/loginuid", &p);
1656         if (r == -ENOENT)
1657                 return 0;
1658         if (r < 0) {
1659                 log_error_errno(-r, "Failed to read /proc/self/loginuid: %m");
1660                 return r;
1661         }
1662
1663         /* Already reset? */
1664         if (streq(p, "4294967295"))
1665                 return 0;
1666
1667         r = write_string_file("/proc/self/loginuid", "4294967295");
1668         if (r < 0) {
1669                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1670                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1671                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1672                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1673                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1674
1675                 sleep(5);
1676         }
1677
1678         return 0;
1679 }
1680
1681 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1682 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1683
1684 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
1685         int r;
1686
1687         uint8_t result[8];
1688         size_t l, sz;
1689         uint8_t *v;
1690
1691         l = strlen(arg_machine);
1692         sz = sizeof(sd_id128_t) + l;
1693         v = alloca(sz);
1694
1695         /* fetch some persistent data unique to the host */
1696         r = sd_id128_get_machine((sd_id128_t*) v);
1697         if (r < 0)
1698                 return r;
1699
1700         /* combine with some data unique (on this host) to this
1701          * container instance */
1702         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1703
1704         /* Let's hash the host machine ID plus the container name. We
1705          * use a fixed, but originally randomly created hash key here. */
1706         siphash24(result, v, sz, hash_key.bytes);
1707
1708         assert_cc(ETH_ALEN <= sizeof(result));
1709         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1710
1711         /* see eth_random_addr in the kernel */
1712         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1713         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1714
1715         return 0;
1716 }
1717
1718 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1719         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1720         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1721         struct ether_addr mac_host, mac_container;
1722         int r, i;
1723
1724         if (!arg_private_network)
1725                 return 0;
1726
1727         if (!arg_network_veth)
1728                 return 0;
1729
1730         /* Use two different interface name prefixes depending whether
1731          * we are in bridge mode or not. */
1732         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1733                  arg_network_bridge ? "vb" : "ve", arg_machine);
1734
1735         r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
1736         if (r < 0) {
1737                 log_error("Failed to generate predictable MAC address for container side");
1738                 return r;
1739         }
1740
1741         r = generate_mac(&mac_host, HOST_HASH_KEY);
1742         if (r < 0) {
1743                 log_error("Failed to generate predictable MAC address for host side");
1744                 return r;
1745         }
1746
1747         r = sd_rtnl_open(&rtnl, 0);
1748         if (r < 0) {
1749                 log_error_errno(-r, "Failed to connect to netlink: %m");
1750                 return r;
1751         }
1752
1753         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1754         if (r < 0) {
1755                 log_error_errno(-r, "Failed to allocate netlink message: %m");
1756                 return r;
1757         }
1758
1759         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1760         if (r < 0) {
1761                 log_error_errno(-r, "Failed to add netlink interface name: %m");
1762                 return r;
1763         }
1764
1765         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1766         if (r < 0) {
1767                 log_error_errno(-r, "Failed to add netlink MAC address: %m");
1768                 return r;
1769         }
1770
1771         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1772         if (r < 0) {
1773                 log_error_errno(-r, "Failed to open netlink container: %m");
1774                 return r;
1775         }
1776
1777         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1778         if (r < 0) {
1779                 log_error_errno(-r, "Failed to open netlink container: %m");
1780                 return r;
1781         }
1782
1783         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1784         if (r < 0) {
1785                 log_error_errno(-r, "Failed to open netlink container: %m");
1786                 return r;
1787         }
1788
1789         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1790         if (r < 0) {
1791                 log_error_errno(-r, "Failed to add netlink interface name: %m");
1792                 return r;
1793         }
1794
1795         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1796         if (r < 0) {
1797                 log_error_errno(-r, "Failed to add netlink MAC address: %m");
1798                 return r;
1799         }
1800
1801         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1802         if (r < 0) {
1803                 log_error_errno(-r, "Failed to add netlink namespace field: %m");
1804                 return r;
1805         }
1806
1807         r = sd_rtnl_message_close_container(m);
1808         if (r < 0) {
1809                 log_error_errno(-r, "Failed to close netlink container: %m");
1810                 return r;
1811         }
1812
1813         r = sd_rtnl_message_close_container(m);
1814         if (r < 0) {
1815                 log_error_errno(-r, "Failed to close netlink container: %m");
1816                 return r;
1817         }
1818
1819         r = sd_rtnl_message_close_container(m);
1820         if (r < 0) {
1821                 log_error_errno(-r, "Failed to close netlink container: %m");
1822                 return r;
1823         }
1824
1825         r = sd_rtnl_call(rtnl, m, 0, NULL);
1826         if (r < 0) {
1827                 log_error_errno(-r, "Failed to add new veth interfaces: %m");
1828                 return r;
1829         }
1830
1831         i = (int) if_nametoindex(iface_name);
1832         if (i <= 0) {
1833                 log_error("Failed to resolve interface %s: %m", iface_name);
1834                 return -errno;
1835         }
1836
1837         *ifi = i;
1838
1839         return 0;
1840 }
1841
1842 static int setup_bridge(const char veth_name[], int *ifi) {
1843         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1844         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1845         int r, bridge;
1846
1847         if (!arg_private_network)
1848                 return 0;
1849
1850         if (!arg_network_veth)
1851                 return 0;
1852
1853         if (!arg_network_bridge)
1854                 return 0;
1855
1856         bridge = (int) if_nametoindex(arg_network_bridge);
1857         if (bridge <= 0) {
1858                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1859                 return -errno;
1860         }
1861
1862         *ifi = bridge;
1863
1864         r = sd_rtnl_open(&rtnl, 0);
1865         if (r < 0) {
1866                 log_error_errno(-r, "Failed to connect to netlink: %m");
1867                 return r;
1868         }
1869
1870         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1871         if (r < 0) {
1872                 log_error_errno(-r, "Failed to allocate netlink message: %m");
1873                 return r;
1874         }
1875
1876         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1877         if (r < 0) {
1878                 log_error_errno(-r, "Failed to set IFF_UP flag: %m");
1879                 return r;
1880         }
1881
1882         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1883         if (r < 0) {
1884                 log_error_errno(-r, "Failed to add netlink interface name field: %m");
1885                 return r;
1886         }
1887
1888         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1889         if (r < 0) {
1890                 log_error_errno(-r, "Failed to add netlink master field: %m");
1891                 return r;
1892         }
1893
1894         r = sd_rtnl_call(rtnl, m, 0, NULL);
1895         if (r < 0) {
1896                 log_error_errno(-r, "Failed to add veth interface to bridge: %m");
1897                 return r;
1898         }
1899
1900         return 0;
1901 }
1902
1903 static int parse_interface(struct udev *udev, const char *name) {
1904         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1905         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1906         int ifi;
1907
1908         ifi = (int) if_nametoindex(name);
1909         if (ifi <= 0) {
1910                 log_error("Failed to resolve interface %s: %m", name);
1911                 return -errno;
1912         }
1913
1914         sprintf(ifi_str, "n%i", ifi);
1915         d = udev_device_new_from_device_id(udev, ifi_str);
1916         if (!d) {
1917                 log_error("Failed to get udev device for interface %s: %m", name);
1918                 return -errno;
1919         }
1920
1921         if (udev_device_get_is_initialized(d) <= 0) {
1922                 log_error("Network interface %s is not initialized yet.", name);
1923                 return -EBUSY;
1924         }
1925
1926         return ifi;
1927 }
1928
1929 static int move_network_interfaces(pid_t pid) {
1930         _cleanup_udev_unref_ struct udev *udev = NULL;
1931         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1932         char **i;
1933         int r;
1934
1935         if (!arg_private_network)
1936                 return 0;
1937
1938         if (strv_isempty(arg_network_interfaces))
1939                 return 0;
1940
1941         r = sd_rtnl_open(&rtnl, 0);
1942         if (r < 0) {
1943                 log_error_errno(-r, "Failed to connect to netlink: %m");
1944                 return r;
1945         }
1946
1947         udev = udev_new();
1948         if (!udev) {
1949                 log_error("Failed to connect to udev.");
1950                 return -ENOMEM;
1951         }
1952
1953         STRV_FOREACH(i, arg_network_interfaces) {
1954                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1955                 int ifi;
1956
1957                 ifi = parse_interface(udev, *i);
1958                 if (ifi < 0)
1959                         return ifi;
1960
1961                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1962                 if (r < 0) {
1963                         log_error_errno(-r, "Failed to allocate netlink message: %m");
1964                         return r;
1965                 }
1966
1967                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1968                 if (r < 0) {
1969                         log_error_errno(-r, "Failed to append namespace PID to netlink message: %m");
1970                         return r;
1971                 }
1972
1973                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1974                 if (r < 0) {
1975                         log_error_errno(-r, "Failed to move interface %s to namespace: %m", *i);
1976                         return r;
1977                 }
1978         }
1979
1980         return 0;
1981 }
1982
1983 static int setup_macvlan(pid_t pid) {
1984         _cleanup_udev_unref_ struct udev *udev = NULL;
1985         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1986         char **i;
1987         int r;
1988
1989         if (!arg_private_network)
1990                 return 0;
1991
1992         if (strv_isempty(arg_network_macvlan))
1993                 return 0;
1994
1995         r = sd_rtnl_open(&rtnl, 0);
1996         if (r < 0) {
1997                 log_error_errno(-r, "Failed to connect to netlink: %m");
1998                 return r;
1999         }
2000
2001         udev = udev_new();
2002         if (!udev) {
2003                 log_error("Failed to connect to udev.");
2004                 return -ENOMEM;
2005         }
2006
2007         STRV_FOREACH(i, arg_network_macvlan) {
2008                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2009                 _cleanup_free_ char *n = NULL;
2010                 int ifi;
2011
2012                 ifi = parse_interface(udev, *i);
2013                 if (ifi < 0)
2014                         return ifi;
2015
2016                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2017                 if (r < 0) {
2018                         log_error_errno(-r, "Failed to allocate netlink message: %m");
2019                         return r;
2020                 }
2021
2022                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2023                 if (r < 0) {
2024                         log_error_errno(-r, "Failed to add netlink interface index: %m");
2025                         return r;
2026                 }
2027
2028                 n = strappend("mv-", *i);
2029                 if (!n)
2030                         return log_oom();
2031
2032                 strshorten(n, IFNAMSIZ-1);
2033
2034                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2035                 if (r < 0) {
2036                         log_error_errno(-r, "Failed to add netlink interface name: %m");
2037                         return r;
2038                 }
2039
2040                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2041                 if (r < 0) {
2042                         log_error_errno(-r, "Failed to add netlink namespace field: %m");
2043                         return r;
2044                 }
2045
2046                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2047                 if (r < 0) {
2048                         log_error_errno(-r, "Failed to open netlink container: %m");
2049                         return r;
2050                 }
2051
2052                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2053                 if (r < 0) {
2054                         log_error_errno(-r, "Failed to open netlink container: %m");
2055                         return r;
2056                 }
2057
2058                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2059                 if (r < 0) {
2060                         log_error_errno(-r, "Failed to append macvlan mode: %m");
2061                         return r;
2062                 }
2063
2064                 r = sd_rtnl_message_close_container(m);
2065                 if (r < 0) {
2066                         log_error_errno(-r, "Failed to close netlink container: %m");
2067                         return r;
2068                 }
2069
2070                 r = sd_rtnl_message_close_container(m);
2071                 if (r < 0) {
2072                         log_error_errno(-r, "Failed to close netlink container: %m");
2073                         return r;
2074                 }
2075
2076                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2077                 if (r < 0) {
2078                         log_error_errno(-r, "Failed to add new macvlan interfaces: %m");
2079                         return r;
2080                 }
2081         }
2082
2083         return 0;
2084 }
2085
/* Installs a seccomp filter with a default action of "allow" that
 * makes a fixed list of system calls fail with EPERM and makes
 * creation of NETLINK_AUDIT sockets fail with EAFNOSUPPORT.
 *
 * When built without HAVE_SECCOMP this is a no-op returning 0.
 * Otherwise returns the result of the last seccomp call: 0 on
 * success, negative on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx seccomp;
        unsigned k;
        int r;

        seccomp = seccomp_init(SCMP_ACT_ALLOW);
        if (!seccomp)
                return log_oom();

        r = seccomp_add_secondary_archs(seccomp);
        if (r < 0) {
                log_error_errno(-r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM), blacklist[k], 0);

                /* -EFAULT means the syscall is unknown; that is
                 * tolerated, every other failure is fatal. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(-r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(seccomp,
                             SCMP_ACT_ERRNO(EAFNOSUPPORT),
                             SCMP_SYS(socket),
                             2,
                             SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                             SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(-r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Clear the filter's NO_NEW_PRIVS control attribute before
         * loading it. */
        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(-r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(seccomp);
        if (r < 0)
                log_error_errno(-r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path. */
        seccomp_release(seccomp);
        return r;
#else
        return 0;
#endif

}
2165
2166 static int setup_image(char **device_path, int *loop_nr) {
2167         struct loop_info64 info = {
2168                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2169         };
2170         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2171         _cleanup_free_ char* loopdev = NULL;
2172         struct stat st;
2173         int r, nr;
2174
2175         assert(device_path);
2176         assert(loop_nr);
2177
2178         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2179         if (fd < 0) {
2180                 log_error("Failed to open %s: %m", arg_image);
2181                 return -errno;
2182         }
2183
2184         if (fstat(fd, &st) < 0) {
2185                 log_error("Failed to stat %s: %m", arg_image);
2186                 return -errno;
2187         }
2188
2189         if (S_ISBLK(st.st_mode)) {
2190                 char *p;
2191
2192                 p = strdup(arg_image);
2193                 if (!p)
2194                         return log_oom();
2195
2196                 *device_path = p;
2197
2198                 *loop_nr = -1;
2199
2200                 r = fd;
2201                 fd = -1;
2202
2203                 return r;
2204         }
2205
2206         if (!S_ISREG(st.st_mode)) {
2207                 log_error("%s is not a regular file or block device: %m", arg_image);
2208                 return -EINVAL;
2209         }
2210
2211         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2212         if (control < 0) {
2213                 log_error("Failed to open /dev/loop-control: %m");
2214                 return -errno;
2215         }
2216
2217         nr = ioctl(control, LOOP_CTL_GET_FREE);
2218         if (nr < 0) {
2219                 log_error("Failed to allocate loop device: %m");
2220                 return -errno;
2221         }
2222
2223         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2224                 return log_oom();
2225
2226         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2227         if (loop < 0) {
2228                 log_error("Failed to open loop device %s: %m", loopdev);
2229                 return -errno;
2230         }
2231
2232         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2233                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2234                 return -errno;
2235         }
2236
2237         if (arg_read_only)
2238                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2239
2240         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2241                 log_error("Failed to set loopback settings on %s: %m", loopdev);
2242                 return -errno;
2243         }
2244
2245         *device_path = loopdev;
2246         loopdev = NULL;
2247
2248         *loop_nr = nr;
2249
2250         r = loop;
2251         loop = -1;
2252
2253         return r;
2254 }
2255
2256 static int dissect_image(
2257                 int fd,
2258                 char **root_device, bool *root_device_rw,
2259                 char **home_device, bool *home_device_rw,
2260                 char **srv_device, bool *srv_device_rw,
2261                 bool *secondary) {
2262
2263 #ifdef HAVE_BLKID
2264         int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
2265         _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2266         _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2267         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2268         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2269         _cleanup_udev_unref_ struct udev *udev = NULL;
2270         struct udev_list_entry *first, *item;
2271         bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2272         const char *pttype = NULL;
2273         blkid_partlist pl;
2274         struct stat st;
2275         int r;
2276
2277         assert(fd >= 0);
2278         assert(root_device);
2279         assert(home_device);
2280         assert(srv_device);
2281         assert(secondary);
2282
2283         b = blkid_new_probe();
2284         if (!b)
2285                 return log_oom();
2286
2287         errno = 0;
2288         r = blkid_probe_set_device(b, fd, 0, 0);
2289         if (r != 0) {
2290                 if (errno == 0)
2291                         return log_oom();
2292
2293                 log_error("Failed to set device on blkid probe: %m");
2294                 return -errno;
2295         }
2296
2297         blkid_probe_enable_partitions(b, 1);
2298         blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2299
2300         errno = 0;
2301         r = blkid_do_safeprobe(b);
2302         if (r == -2 || r == 1) {
2303                 log_error("Failed to identify any partition table on %s.\n"
2304                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2305                 return -EINVAL;
2306         } else if (r != 0) {
2307                 if (errno == 0)
2308                         errno = EIO;
2309                 log_error("Failed to probe: %m");
2310                 return -errno;
2311         }
2312
2313         blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2314         if (!streq_ptr(pttype, "gpt")) {
2315                 log_error("Image %s does not carry a GUID Partition Table.\n"
2316                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2317                 return -EINVAL;
2318         }
2319
2320         errno = 0;
2321         pl = blkid_probe_get_partitions(b);
2322         if (!pl) {
2323                 if (errno == 0)
2324                         return log_oom();
2325
2326                 log_error("Failed to list partitions of %s", arg_image);
2327                 return -errno;
2328         }
2329
2330         udev = udev_new();
2331         if (!udev)
2332                 return log_oom();
2333
2334         if (fstat(fd, &st) < 0) {
2335                 log_error("Failed to stat block device: %m");
2336                 return -errno;
2337         }
2338
2339         d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2340         if (!d)
2341                 return log_oom();
2342
2343         e = udev_enumerate_new(udev);
2344         if (!e)
2345                 return log_oom();
2346
2347         r = udev_enumerate_add_match_parent(e, d);
2348         if (r < 0)
2349                 return log_oom();
2350
2351         r = udev_enumerate_scan_devices(e);
2352         if (r < 0) {
2353                 log_error_errno(-r, "Failed to scan for partition devices of %s: %m", arg_image);
2354                 return r;
2355         }
2356
2357         first = udev_enumerate_get_list_entry(e);
2358         udev_list_entry_foreach(item, first) {
2359                 _cleanup_udev_device_unref_ struct udev_device *q;
2360                 const char *stype, *node;
2361                 unsigned long long flags;
2362                 sd_id128_t type_id;
2363                 blkid_partition pp;
2364                 dev_t qn;
2365                 int nr;
2366
2367                 errno = 0;
2368                 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2369                 if (!q) {
2370                         if (!errno)
2371                                 errno = ENOMEM;
2372
2373                         log_error("Failed to get partition device of %s: %m", arg_image);
2374                         return -errno;
2375                 }
2376
2377                 qn = udev_device_get_devnum(q);
2378                 if (major(qn) == 0)
2379                         continue;
2380
2381                 if (st.st_rdev == qn)
2382                         continue;
2383
2384                 node = udev_device_get_devnode(q);
2385                 if (!node)
2386                         continue;
2387
2388                 pp = blkid_partlist_devno_to_partition(pl, qn);
2389                 if (!pp)
2390                         continue;
2391
2392                 flags = blkid_partition_get_flags(pp);
2393                 if (flags & GPT_FLAG_NO_AUTO)
2394                         continue;
2395
2396                 nr = blkid_partition_get_partno(pp);
2397                 if (nr < 0)
2398                         continue;
2399
2400                 stype = blkid_partition_get_type_string(pp);
2401                 if (!stype)
2402                         continue;
2403
2404                 if (sd_id128_from_string(stype, &type_id) < 0)
2405                         continue;
2406
2407                 if (sd_id128_equal(type_id, GPT_HOME)) {
2408
2409                         if (home && nr >= home_nr)
2410                                 continue;
2411
2412                         home_nr = nr;
2413                         home_rw = !(flags & GPT_FLAG_READ_ONLY);
2414
2415                         free(home);
2416                         home = strdup(node);
2417                         if (!home)
2418                                 return log_oom();
2419                 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2420
2421                         if (srv && nr >= srv_nr)
2422                                 continue;
2423
2424                         srv_nr = nr;
2425                         srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2426
2427                         free(srv);
2428                         srv = strdup(node);
2429                         if (!srv)
2430                                 return log_oom();
2431                 }
2432 #ifdef GPT_ROOT_NATIVE
2433                 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2434
2435                         if (root && nr >= root_nr)
2436                                 continue;
2437
2438                         root_nr = nr;
2439                         root_rw = !(flags & GPT_FLAG_READ_ONLY);
2440
2441                         free(root);
2442                         root = strdup(node);
2443                         if (!root)
2444                                 return log_oom();
2445                 }
2446 #endif
2447 #ifdef GPT_ROOT_SECONDARY
2448                 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2449
2450                         if (secondary_root && nr >= secondary_root_nr)
2451                                 continue;
2452
2453                         secondary_root_nr = nr;
2454                         secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2455
2456
2457                         free(secondary_root);
2458                         secondary_root = strdup(node);
2459                         if (!secondary_root)
2460                                 return log_oom();
2461                 }
2462 #endif
2463         }
2464
2465         if (!root && !secondary_root) {
2466                 log_error("Failed to identify root partition in disk image %s.\n"
2467                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2468                 return -EINVAL;
2469         }
2470
2471         if (root) {
2472                 *root_device = root;
2473                 root = NULL;
2474
2475                 *root_device_rw = root_rw;
2476                 *secondary = false;
2477         } else if (secondary_root) {
2478                 *root_device = secondary_root;
2479                 secondary_root = NULL;
2480
2481                 *root_device_rw = secondary_root_rw;
2482                 *secondary = true;
2483         }
2484
2485         if (home) {
2486                 *home_device = home;
2487                 home = NULL;
2488
2489                 *home_device_rw = home_rw;
2490         }
2491
2492         if (srv) {
2493                 *srv_device = srv;
2494                 srv = NULL;
2495
2496                 *srv_device_rw = srv_rw;
2497         }
2498
2499         return 0;
2500 #else
2501         log_error("--image= is not supported, compiled without blkid support.");
2502         return -ENOTSUP;
2503 #endif
2504 }
2505
/*
 * Probes the block device 'what' for its file system type using libblkid
 * and mounts it on 'where', or on 'where' with 'directory' appended if
 * 'directory' is non-NULL.
 *
 * The mount always carries MS_NODEV. It is read-only unless 'rw' is true,
 * and is forced read-only when arg_read_only is set. LUKS containers are
 * refused with -ENOTSUP.
 *
 * Returns 0 on success and a negative errno-style value on failure. When
 * built without HAVE_BLKID this always fails with -ENOTSUP.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* blkid does not reliably set errno, so clear it first to be able
         * to tell "no errno reported" apart from a real error code. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 0 on success, 1 if nothing was
         * detected, -2 if the result is ambivalent and -1 on error. Only
         * the "nothing"/"ambivalent" cases mean the type is undeterminable;
         * -1 is a genuine probing failure. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2571
/*
 * Mounts the partitions found in a disk image below 'where':
 * the root device on 'where' itself, the home device on 'where'/home and
 * the srv device on 'where'/srv. Any device passed as NULL is skipped.
 * The *_rw flags are handed through to mount_device().
 *
 * Returns 0 on success, or the negative error returned by the first
 * mount_device() call that failed.
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount below the directory we were asked to use rather than
         * reaching for the global arg_directory behind the caller's back. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error_errno(-r, "Failed to mount root directory: %m");
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error_errno(-r, "Failed to mount home directory: %m");
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error_errno(-r, "Failed to mount server data directory: %m");
                        return r;
                }
        }

        return 0;
}
2607
2608 static void loop_remove(int nr, int *image_fd) {
2609         _cleanup_close_ int control = -1;
2610         int r;
2611
2612         if (nr < 0)
2613                 return;
2614
2615         if (image_fd && *image_fd >= 0) {
2616                 r = ioctl(*image_fd, LOOP_CLR_FD);
2617                 if (r < 0)
2618                         log_warning("Failed to close loop image: %m");
2619                 *image_fd = safe_close(*image_fd);
2620         }
2621
2622         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2623         if (control < 0) {
2624                 log_warning("Failed to open /dev/loop-control: %m");
2625                 return;
2626         }
2627
2628         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2629         if (r < 0)
2630                 log_warning("Failed to remove loop %d: %m", nr);
2631 }
2632
2633 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2634         int pipe_fds[2];
2635         pid_t pid;
2636
2637         assert(database);
2638         assert(key);
2639         assert(rpid);
2640
2641         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2642                 log_error("Failed to allocate pipe: %m");
2643                 return -errno;
2644         }
2645
2646         pid = fork();
2647         if (pid < 0) {
2648                 log_error("Failed to fork getent child: %m");
2649                 return -errno;
2650         } else if (pid == 0) {
2651                 int nullfd;
2652                 char *empty_env = NULL;
2653
2654                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2655                         _exit(EXIT_FAILURE);
2656
2657                 if (pipe_fds[0] > 2)
2658                         safe_close(pipe_fds[0]);
2659                 if (pipe_fds[1] > 2)
2660                         safe_close(pipe_fds[1]);
2661
2662                 nullfd = open("/dev/null", O_RDWR);
2663                 if (nullfd < 0)
2664                         _exit(EXIT_FAILURE);
2665
2666                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2667                         _exit(EXIT_FAILURE);
2668
2669                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2670                         _exit(EXIT_FAILURE);
2671
2672                 if (nullfd > 2)
2673                         safe_close(nullfd);
2674
2675                 reset_all_signal_handlers();
2676                 close_all_fds(NULL, 0);
2677
2678                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2679                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2680                 _exit(EXIT_FAILURE);
2681         }
2682
2683         pipe_fds[1] = safe_close(pipe_fds[1]);
2684
2685         *rpid = pid;
2686
2687         return pipe_fds[0];
2688 }
2689
2690 static int change_uid_gid(char **_home) {
2691         char line[LINE_MAX], *x, *u, *g, *h;
2692         const char *word, *state;
2693         _cleanup_free_ uid_t *uids = NULL;
2694         _cleanup_free_ char *home = NULL;
2695         _cleanup_fclose_ FILE *f = NULL;
2696         _cleanup_close_ int fd = -1;
2697         unsigned n_uids = 0;
2698         size_t sz = 0, l;
2699         uid_t uid;
2700         gid_t gid;
2701         pid_t pid;
2702         int r;
2703
2704         assert(_home);
2705
2706         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2707                 /* Reset everything fully to 0, just in case */
2708
2709                 if (setgroups(0, NULL) < 0) {
2710                         log_error("setgroups() failed: %m");
2711                         return -errno;
2712                 }
2713
2714                 if (setresgid(0, 0, 0) < 0) {
2715                         log_error("setregid() failed: %m");
2716                         return -errno;
2717                 }
2718
2719                 if (setresuid(0, 0, 0) < 0) {
2720                         log_error("setreuid() failed: %m");
2721                         return -errno;
2722                 }
2723
2724                 *_home = NULL;
2725                 return 0;
2726         }
2727
2728         /* First, get user credentials */
2729         fd = spawn_getent("passwd", arg_user, &pid);
2730         if (fd < 0)
2731                 return fd;
2732
2733         f = fdopen(fd, "r");
2734         if (!f)
2735                 return log_oom();
2736         fd = -1;
2737
2738         if (!fgets(line, sizeof(line), f)) {
2739
2740                 if (!ferror(f)) {
2741                         log_error("Failed to resolve user %s.", arg_user);
2742                         return -ESRCH;
2743                 }
2744
2745                 log_error("Failed to read from getent: %m");
2746                 return -errno;
2747         }
2748
2749         truncate_nl(line);
2750
2751         wait_for_terminate_and_warn("getent passwd", pid);
2752
2753         x = strchr(line, ':');
2754         if (!x) {
2755                 log_error("/etc/passwd entry has invalid user field.");
2756                 return -EIO;
2757         }
2758
2759         u = strchr(x+1, ':');
2760         if (!u) {
2761                 log_error("/etc/passwd entry has invalid password field.");
2762                 return -EIO;
2763         }
2764
2765         u++;
2766         g = strchr(u, ':');
2767         if (!g) {
2768                 log_error("/etc/passwd entry has invalid UID field.");
2769                 return -EIO;
2770         }
2771
2772         *g = 0;
2773         g++;
2774         x = strchr(g, ':');
2775         if (!x) {
2776                 log_error("/etc/passwd entry has invalid GID field.");
2777                 return -EIO;
2778         }
2779
2780         *x = 0;
2781         h = strchr(x+1, ':');
2782         if (!h) {
2783                 log_error("/etc/passwd entry has invalid GECOS field.");
2784                 return -EIO;
2785         }
2786
2787         h++;
2788         x = strchr(h, ':');
2789         if (!x) {
2790                 log_error("/etc/passwd entry has invalid home directory field.");
2791                 return -EIO;
2792         }
2793
2794         *x = 0;
2795
2796         r = parse_uid(u, &uid);
2797         if (r < 0) {
2798                 log_error("Failed to parse UID of user.");
2799                 return -EIO;
2800         }
2801
2802         r = parse_gid(g, &gid);
2803         if (r < 0) {
2804                 log_error("Failed to parse GID of user.");
2805                 return -EIO;
2806         }
2807
2808         home = strdup(h);
2809         if (!home)
2810                 return log_oom();
2811
2812         /* Second, get group memberships */
2813         fd = spawn_getent("initgroups", arg_user, &pid);
2814         if (fd < 0)
2815                 return fd;
2816
2817         fclose(f);
2818         f = fdopen(fd, "r");
2819         if (!f)
2820                 return log_oom();
2821         fd = -1;
2822
2823         if (!fgets(line, sizeof(line), f)) {
2824                 if (!ferror(f)) {
2825                         log_error("Failed to resolve user %s.", arg_user);
2826                         return -ESRCH;
2827                 }
2828
2829                 log_error("Failed to read from getent: %m");
2830                 return -errno;
2831         }
2832
2833         truncate_nl(line);
2834
2835         wait_for_terminate_and_warn("getent initgroups", pid);
2836
2837         /* Skip over the username and subsequent separator whitespace */
2838         x = line;
2839         x += strcspn(x, WHITESPACE);
2840         x += strspn(x, WHITESPACE);
2841
2842         FOREACH_WORD(word, l, x, state) {
2843                 char c[l+1];
2844
2845                 memcpy(c, word, l);
2846                 c[l] = 0;
2847
2848                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2849                         return log_oom();
2850
2851                 r = parse_uid(c, &uids[n_uids++]);
2852                 if (r < 0) {
2853                         log_error("Failed to parse group data from getent.");
2854                         return -EIO;
2855                 }
2856         }
2857
2858         r = mkdir_parents(home, 0775);
2859         if (r < 0) {
2860                 log_error_errno(-r, "Failed to make home root directory: %m");
2861                 return r;
2862         }
2863
2864         r = mkdir_safe(home, 0755, uid, gid);
2865         if (r < 0 && r != -EEXIST) {
2866                 log_error_errno(-r, "Failed to make home directory: %m");
2867                 return r;
2868         }
2869
2870         fchown(STDIN_FILENO, uid, gid);
2871         fchown(STDOUT_FILENO, uid, gid);
2872         fchown(STDERR_FILENO, uid, gid);
2873
2874         if (setgroups(n_uids, uids) < 0) {
2875                 log_error("Failed to set auxiliary groups: %m");
2876                 return -errno;
2877         }
2878
2879         if (setresgid(gid, gid, gid) < 0) {
2880                 log_error("setregid() failed: %m");
2881                 return -errno;
2882         }
2883
2884         if (setresuid(uid, uid, uid) < 0) {
2885                 log_error("setreuid() failed: %m");
2886                 return -errno;
2887         }
2888
2889         if (_home) {
2890                 *_home = home;
2891                 home = NULL;
2892         }
2893
2894         return 0;
2895 }
2896
2897 /*
2898  * Return values:
2899  * < 0 : wait_for_terminate() failed to get the state of the
2900  *       container, the container was terminated by a signal, or
2901  *       failed for an unknown reason.  No change is made to the
2902  *       container argument.
2903  * > 0 : The program executed in the container terminated with an
2904  *       error.  The exit code of the program executed in the
2905  *       container is returned.  The container argument has been set
2906  *       to CONTAINER_TERMINATED.
2907  *   0 : The container is being rebooted, has been shut down or exited
2908  *       successfully.  The container argument has been set to either
2909  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2910  *
2911  * That is, success is indicated by a return value of zero, and an
2912  * error is indicated by a non-zero value.
2913  */
2914 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2915         siginfo_t status;
2916         int r;
2917
2918         r = wait_for_terminate(pid, &status);
2919         if (r < 0) {
2920                 log_warning_errno(-r, "Failed to wait for container: %m");
2921                 return r;
2922         }
2923
2924         switch (status.si_code) {
2925
2926         case CLD_EXITED:
2927                 if (status.si_status == 0) {
2928                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2929
2930                 } else
2931                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2932
2933                 *container = CONTAINER_TERMINATED;
2934                 return status.si_status;
2935
2936         case CLD_KILLED:
2937                 if (status.si_status == SIGINT) {
2938
2939                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2940                         *container = CONTAINER_TERMINATED;
2941                         return 0;
2942
2943                 } else if (status.si_status == SIGHUP) {
2944
2945                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2946                         *container = CONTAINER_REBOOTED;
2947                         return 0;
2948                 }
2949
2950                 /* CLD_KILLED fallthrough */
2951
2952         case CLD_DUMPED:
2953                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2954                 return -EIO;
2955
2956         default:
2957                 log_error("Container %s failed due to unknown reason.", arg_machine);
2958                 return -EIO;
2959         }
2960
2961         return r;
2962 }
2963
/* Signal handler that intentionally does nothing with the signal it receives. */
static void nop_handler(int sig) {
        (void) sig;
}
2965
2966 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2967         pid_t pid;
2968
2969         pid = PTR_TO_UINT32(userdata);
2970         if (pid > 0) {
2971                 if (kill(pid, SIGRTMIN+3) >= 0) {
2972                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2973                         sd_event_source_set_userdata(s, NULL);
2974                         return 0;
2975                 }
2976         }
2977
2978         sd_event_exit(sd_event_source_get_event(s), 0);
2979         return 0;
2980 }
2981
2982 int main(int argc, char *argv[]) {
2983
2984         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2985         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2986         _cleanup_close_ int master = -1, image_fd = -1;
2987         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2988         _cleanup_fdset_free_ FDSet *fds = NULL;
2989         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2990         const char *console = NULL;
2991         char veth_name[IFNAMSIZ];
2992         bool secondary = false;
2993         sigset_t mask, mask_chld;
2994         pid_t pid = 0;
2995
2996         log_parse_environment();
2997         log_open();
2998
2999         k = parse_argv(argc, argv);
3000         if (k < 0)
3001                 goto finish;
3002         else if (k == 0) {
3003                 r = EXIT_SUCCESS;
3004                 goto finish;
3005         }
3006
3007         if (!arg_image) {
3008                 if (arg_directory) {
3009                         char *p;
3010
3011                         p = path_make_absolute_cwd(arg_directory);
3012                         free(arg_directory);
3013                         arg_directory = p;
3014                 } else
3015                         arg_directory = get_current_dir_name();
3016
3017                 if (!arg_directory) {
3018                         log_error("Failed to determine path, please use -D.");
3019                         goto finish;
3020                 }
3021                 path_kill_slashes(arg_directory);
3022         }
3023
3024         if (!arg_machine) {
3025                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
3026                 if (!arg_machine) {
3027                         log_oom();
3028                         goto finish;
3029                 }
3030
3031                 hostname_cleanup(arg_machine, false);
3032                 if (isempty(arg_machine)) {
3033                         log_error("Failed to determine machine name automatically, please use -M.");
3034                         goto finish;
3035                 }
3036         }
3037
3038         if (geteuid() != 0) {
3039                 log_error("Need to be root.");
3040                 goto finish;
3041         }
3042
3043         if (sd_booted() <= 0) {
3044                 log_error("Not running on a systemd system.");
3045                 goto finish;
3046         }
3047
3048         log_close();
3049         n_fd_passed = sd_listen_fds(false);
3050         if (n_fd_passed > 0) {
3051                 k = fdset_new_listen_fds(&fds, false);
3052                 if (k < 0) {
3053                         log_error_errno(-k, "Failed to collect file descriptors: %m");
3054                         goto finish;
3055                 }
3056         }
3057         fdset_close_others(fds);
3058         log_open();
3059
3060         if (arg_directory) {
3061                 if (path_equal(arg_directory, "/")) {
3062                         log_error("Spawning container on root directory not supported.");
3063                         goto finish;
3064                 }
3065
3066                 if (arg_boot) {
3067                         if (path_is_os_tree(arg_directory) <= 0) {
3068                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3069                                 goto finish;
3070                         }
3071                 } else {
3072                         const char *p;
3073
3074                         p = strappenda(arg_directory,
3075                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3076                         if (access(p, F_OK) < 0) {
3077                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3078                                 goto finish;
3079
3080                         }
3081                 }
3082         } else {
3083                 char template[] = "/tmp/nspawn-root-XXXXXX";
3084
3085                 if (!mkdtemp(template)) {
3086                         log_error("Failed to create temporary directory: %m");
3087                         r = -errno;
3088                         goto finish;
3089                 }
3090
3091                 arg_directory = strdup(template);
3092                 if (!arg_directory) {
3093                         r = log_oom();
3094                         goto finish;
3095                 }
3096
3097                 image_fd = setup_image(&device_path, &loop_nr);
3098                 if (image_fd < 0) {
3099                         r = image_fd;
3100                         goto finish;
3101                 }
3102
3103                 r = dissect_image(image_fd,
3104                                   &root_device, &root_device_rw,
3105                                   &home_device, &home_device_rw,
3106                                   &srv_device, &srv_device_rw,
3107                                   &secondary);
3108                 if (r < 0)
3109                         goto finish;
3110         }
3111
3112         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3113         if (master < 0) {
3114                 log_error("Failed to acquire pseudo tty: %m");
3115                 goto finish;
3116         }
3117
3118         console = ptsname(master);
3119         if (!console) {
3120                 log_error("Failed to determine tty name: %m");
3121                 goto finish;
3122         }
3123
3124         if (!arg_quiet)
3125                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3126                          arg_machine, arg_image ? arg_image : arg_directory);
3127
3128         if (unlockpt(master) < 0) {
3129                 log_error("Failed to unlock tty: %m");
3130                 goto finish;
3131         }
3132
3133         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3134                 log_error("Failed to create kmsg socket pair: %m");
3135                 goto finish;
3136         }
3137
3138         sd_notify(false,
3139                   "READY=1\n"
3140                   "STATUS=Container running.");
3141
3142         assert_se(sigemptyset(&mask) == 0);
3143         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3144         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3145
3146         assert_se(sigemptyset(&mask_chld) == 0);
3147         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3148
3149         for (;;) {
3150                 ContainerStatus container_status;
3151                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3152                 struct sigaction sa = {
3153                         .sa_handler = nop_handler,
3154                         .sa_flags = SA_NOCLDSTOP,
3155                 };
3156
3157                 r = barrier_create(&barrier);
3158                 if (r < 0) {
3159                         log_error_errno(-r, "Cannot initialize IPC barrier: %m");
3160                         goto finish;
3161                 }
3162
3163                 /* Child can be killed before execv(), so handle SIGCHLD
3164                  * in order to interrupt parent's blocking calls and
3165                  * give it a chance to call wait() and terminate. */
3166                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3167                 if (r < 0) {
3168                         log_error("Failed to change the signal mask: %m");
3169                         goto finish;
3170                 }
3171
3172                 r = sigaction(SIGCHLD, &sa, NULL);
3173                 if (r < 0) {
3174                         log_error("Failed to install SIGCHLD handler: %m");
3175                         goto finish;
3176                 }
3177
3178                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3179                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3180                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
3181                 if (pid < 0) {
3182                         if (errno == EINVAL)
3183                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3184                         else
3185                                 log_error("clone() failed: %m");
3186
3187                         r = pid;
3188                         goto finish;
3189                 }
3190
3191                 if (pid == 0) {
3192                         /* child */
3193                         _cleanup_free_ char *home = NULL;
3194                         unsigned n_env = 2;
3195                         const char *envp[] = {
3196                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3197                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3198                                 NULL, /* TERM */
3199                                 NULL, /* HOME */
3200                                 NULL, /* USER */
3201                                 NULL, /* LOGNAME */
3202                                 NULL, /* container_uuid */
3203                                 NULL, /* LISTEN_FDS */
3204                                 NULL, /* LISTEN_PID */
3205                                 NULL
3206                         };
3207                         char **env_use;
3208
3209                         barrier_set_role(&barrier, BARRIER_CHILD);
3210
3211                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3212                         if (envp[n_env])
3213                                 n_env ++;
3214
3215                         master = safe_close(master);
3216
3217                         close_nointr(STDIN_FILENO);
3218                         close_nointr(STDOUT_FILENO);
3219                         close_nointr(STDERR_FILENO);
3220
3221                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3222
3223                         reset_all_signal_handlers();
3224                         reset_signal_mask();
3225
3226                         k = open_terminal(console, O_RDWR);
3227                         if (k != STDIN_FILENO) {
3228                                 if (k >= 0) {
3229                                         safe_close(k);
3230                                         k = -EINVAL;
3231                                 }
3232
3233                                 log_error_errno(-k, "Failed to open console: %m");
3234                                 _exit(EXIT_FAILURE);
3235                         }
3236
3237                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3238                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3239                                 log_error("Failed to duplicate console: %m");
3240                                 _exit(EXIT_FAILURE);
3241                         }
3242
3243                         if (setsid() < 0) {
3244                                 log_error("setsid() failed: %m");
3245                                 _exit(EXIT_FAILURE);
3246                         }
3247
3248                         if (reset_audit_loginuid() < 0)
3249                                 _exit(EXIT_FAILURE);
3250
3251                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3252                                 log_error("PR_SET_PDEATHSIG failed: %m");
3253                                 _exit(EXIT_FAILURE);
3254                         }
3255
3256                         /* Mark everything as slave, so that we still
3257                          * receive mounts from the real root, but don't
3258                          * propagate mounts to the real root. */
3259                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3260                                 log_error("MS_SLAVE|MS_REC failed: %m");
3261                                 _exit(EXIT_FAILURE);
3262                         }
3263
3264                         if (mount_devices(arg_directory,
3265                                           root_device, root_device_rw,
3266                                           home_device, home_device_rw,
3267                                           srv_device, srv_device_rw) < 0)
3268                                 _exit(EXIT_FAILURE);
3269
3270                         /* Turn directory into bind mount */
3271                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3272                                 log_error("Failed to make bind mount: %m");
3273                                 _exit(EXIT_FAILURE);
3274                         }
3275
3276                         r = setup_volatile(arg_directory);
3277                         if (r < 0)
3278                                 _exit(EXIT_FAILURE);
3279
3280                         if (setup_volatile_state(arg_directory) < 0)
3281                                 _exit(EXIT_FAILURE);
3282
3283                         r = base_filesystem_create(arg_directory);
3284                         if (r < 0)
3285                                 _exit(EXIT_FAILURE);
3286
3287                         if (arg_read_only) {
3288                                 k = bind_remount_recursive(arg_directory, true);
3289                                 if (k < 0) {
3290                                         log_error_errno(-k, "Failed to make tree read-only: %m");
3291                                         _exit(EXIT_FAILURE);
3292                                 }
3293                         }
3294
3295                         if (mount_all(arg_directory) < 0)
3296                                 _exit(EXIT_FAILURE);
3297
3298                         if (copy_devnodes(arg_directory) < 0)
3299                                 _exit(EXIT_FAILURE);
3300
3301                         if (setup_ptmx(arg_directory) < 0)
3302                                 _exit(EXIT_FAILURE);
3303
3304                         dev_setup(arg_directory);
3305
3306                         if (setup_seccomp() < 0)
3307                                 _exit(EXIT_FAILURE);
3308
3309                         if (setup_dev_console(arg_directory, console) < 0)
3310                                 _exit(EXIT_FAILURE);
3311
3312                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3313                                 _exit(EXIT_FAILURE);
3314
3315                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3316
3317                         if (setup_boot_id(arg_directory) < 0)
3318                                 _exit(EXIT_FAILURE);
3319
3320                         if (setup_timezone(arg_directory) < 0)
3321                                 _exit(EXIT_FAILURE);
3322
3323                         if (setup_resolv_conf(arg_directory) < 0)
3324                                 _exit(EXIT_FAILURE);
3325
3326                         if (setup_journal(arg_directory) < 0)
3327                                 _exit(EXIT_FAILURE);
3328
3329                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3330                                 _exit(EXIT_FAILURE);
3331
3332                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3333                                 _exit(EXIT_FAILURE);
3334
3335                         if (mount_tmpfs(arg_directory) < 0)
3336                                 _exit(EXIT_FAILURE);
3337
3338                         /* Tell the parent that we are ready, and that
3339                          * it can cgroupify us so that we lack access
3340                          * to certain devices and resources. */
3341                         (void)barrier_place(&barrier);
3342
3343                         if (chdir(arg_directory) < 0) {
3344                                 log_error("chdir(%s) failed: %m", arg_directory);
3345                                 _exit(EXIT_FAILURE);
3346                         }
3347
3348                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3349                                 log_error("mount(MS_MOVE) failed: %m");
3350                                 _exit(EXIT_FAILURE);
3351                         }
3352
3353                         if (chroot(".") < 0) {
3354                                 log_error("chroot() failed: %m");
3355                                 _exit(EXIT_FAILURE);
3356                         }
3357
3358                         if (chdir("/") < 0) {
3359                                 log_error("chdir() failed: %m");
3360                                 _exit(EXIT_FAILURE);
3361                         }
3362
3363                         umask(0022);
3364
3365                         if (arg_private_network)
3366                                 loopback_setup();
3367
3368                         if (drop_capabilities() < 0) {
3369                                 log_error("drop_capabilities() failed: %m");
3370                                 _exit(EXIT_FAILURE);
3371                         }
3372
3373                         r = change_uid_gid(&home);
3374                         if (r < 0)
3375                                 _exit(EXIT_FAILURE);
3376
3377                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3378                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3379                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3380                                 log_oom();
3381                                 _exit(EXIT_FAILURE);
3382                         }
3383
3384                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3385                                 char as_uuid[37];
3386
3387                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3388                                         log_oom();
3389                                         _exit(EXIT_FAILURE);
3390                                 }
3391                         }
3392
3393                         if (fdset_size(fds) > 0) {
3394                                 k = fdset_cloexec(fds, false);
3395                                 if (k < 0) {
3396                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3397                                         _exit(EXIT_FAILURE);
3398                                 }
3399
3400                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3401                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3402                                         log_oom();
3403                                         _exit(EXIT_FAILURE);
3404                                 }
3405                         }
3406
3407                         setup_hostname();
3408
3409                         if (arg_personality != 0xffffffffLU) {
3410                                 if (personality(arg_personality) < 0) {
3411                                         log_error("personality() failed: %m");
3412                                         _exit(EXIT_FAILURE);
3413                                 }
3414                         } else if (secondary) {
3415                                 if (personality(PER_LINUX32) < 0) {
3416                                         log_error("personality() failed: %m");
3417                                         _exit(EXIT_FAILURE);
3418                                 }
3419                         }
3420
3421 #ifdef HAVE_SELINUX
3422                         if (arg_selinux_context)
3423                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3424                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3425                                         _exit(EXIT_FAILURE);
3426                                 }
3427 #endif
3428
3429                         if (!strv_isempty(arg_setenv)) {
3430                                 char **n;
3431
3432                                 n = strv_env_merge(2, envp, arg_setenv);
3433                                 if (!n) {
3434                                         log_oom();
3435                                         _exit(EXIT_FAILURE);
3436                                 }
3437
3438                                 env_use = n;
3439                         } else
3440                                 env_use = (char**) envp;
3441
3442                         /* Wait until the parent is ready with the setup, too... */
3443                         if (!barrier_place_and_sync(&barrier))
3444                                 _exit(EXIT_FAILURE);
3445
3446                         if (arg_boot) {
3447                                 char **a;
3448                                 size_t l;
3449
3450                                 /* Automatically search for the init system */
3451
3452                                 l = 1 + argc - optind;
3453                                 a = newa(char*, l + 1);
3454                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3455
3456                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3457                                 execve(a[0], a, env_use);
3458
3459                                 a[0] = (char*) "/lib/systemd/systemd";
3460                                 execve(a[0], a, env_use);
3461
3462                                 a[0] = (char*) "/sbin/init";
3463                                 execve(a[0], a, env_use);
3464                         } else if (argc > optind)
3465                                 execvpe(argv[optind], argv + optind, env_use);
3466                         else {
3467                                 chdir(home ? home : "/root");
3468                                 execle("/bin/bash", "-bash", NULL, env_use);
3469                                 execle("/bin/sh", "-sh", NULL, env_use);
3470                         }
3471
3472                         log_error("execv() failed: %m");
3473                         _exit(EXIT_FAILURE);
3474                 }
3475
3476                 barrier_set_role(&barrier, BARRIER_PARENT);
3477                 fdset_free(fds);
3478                 fds = NULL;
3479
3480                 /* wait for child-setup to be done */
3481                 if (barrier_place_and_sync(&barrier)) {
3482                         _cleanup_event_unref_ sd_event *event = NULL;
3483                         _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3484                         int ifi = 0;
3485
3486                         r = move_network_interfaces(pid);
3487                         if (r < 0)
3488                                 goto finish;
3489
3490                         r = setup_veth(pid, veth_name, &ifi);
3491                         if (r < 0)
3492                                 goto finish;
3493
3494                         r = setup_bridge(veth_name, &ifi);
3495                         if (r < 0)
3496                                 goto finish;
3497
3498                         r = setup_macvlan(pid);
3499                         if (r < 0)
3500                                 goto finish;
3501
3502                         r = register_machine(pid, ifi);
3503                         if (r < 0)
3504                                 goto finish;
3505
3506                         /* Block SIGCHLD here, before notifying child.
3507                          * process_pty() will handle it with the other signals. */
3508                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3509                         if (r < 0)
3510                                 goto finish;
3511
3512                         /* Reset signal to default */
3513                         r = default_signals(SIGCHLD, -1);
3514                         if (r < 0)
3515                                 goto finish;
3516
3517                         /* Notify the child that the parent is ready with all
3518                          * its setup, and that the child can now hand over
3519                          * control to the code to run inside the container. */
3520                         (void)barrier_place(&barrier);
3521
3522                         r = sd_event_new(&event);
3523                         if (r < 0) {
3524                                 log_error_errno(-r, "Failed to get default event source: %m");
3525                                 goto finish;
3526                         }
3527
3528                         if (arg_boot) {
3529                                 /* Try to kill the init system on SIGINT or SIGTERM */
3530                                 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3531                                 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3532                         } else {
3533                                 /* Immediately exit */
3534                                 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3535                                 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3536                         }
3537
3538                         /* simply exit on sigchld */
3539                         sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3540
3541                         r = pty_forward_new(event, master, &forward);
3542                         if (r < 0) {
3543                                 log_error_errno(-r, "Failed to create PTY forwarder: %m");
3544                                 goto finish;
3545                         }
3546
3547                         r = sd_event_loop(event);
3548                         if (r < 0) {
3549                                 log_error_errno(-r, "Failed to run event loop: %m");
3550                                 return r;
3551                         }
3552
3553                         forward = pty_forward_free(forward);
3554
3555                         if (!arg_quiet)
3556                                 putc('\n', stdout);
3557
3558                         /* Kill if it is not dead yet anyway */
3559                         terminate_machine(pid);
3560                 }
3561
3562                 /* Normally redundant, but better safe than sorry */
3563                 kill(pid, SIGKILL);
3564
3565                 r = wait_for_container(pid, &container_status);
3566                 pid = 0;
3567
3568                 if (r < 0) {
3569                         /* We failed to wait for the container, or the
3570                          * container exited abnormally */
3571                         r = EXIT_FAILURE;
3572                         break;
3573                 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3574                         /* The container exited with a non-zero
3575                          * status, or with zero status and no reboot
3576                          * was requested. */
3577                         break;
3578
3579                 /* CONTAINER_REBOOTED, loop again */
3580
3581                 if (arg_keep_unit) {
3582                         /* Special handling if we are running as a
3583                          * service: instead of simply restarting the
3584                          * machine we want to restart the entire
3585                          * service, so let's inform systemd about this
3586                          * with the special exit code 133. The service
3587                          * file uses RestartForceExitStatus=133 so
3588                          * that this results in a full nspawn
3589                          * restart. This is necessary since we might
3590                          * have cgroup parameters set we want to have
3591                          * flushed out. */
3592                         r = 133;
3593                         break;
3594                 }
3595         }
3596
3597 finish:
3598         sd_notify(false,
3599                   "STOPPING=1\n"
3600                   "STATUS=Terminating...");
3601
3602         loop_remove(loop_nr, &image_fd);
3603
3604         if (pid > 0)
3605                 kill(pid, SIGKILL);
3606
3607         free(arg_directory);
3608         free(arg_machine);
3609         free(arg_user);
3610         strv_free(arg_setenv);
3611         strv_free(arg_network_interfaces);
3612         strv_free(arg_network_macvlan);
3613         strv_free(arg_bind);
3614         strv_free(arg_bind_ro);
3615         strv_free(arg_tmpfs);
3616
3617         return r;
3618 }