chiark / gitweb /
34436b82a2879b3778bce44dcb27f59298740a11
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92
93 #ifdef HAVE_SECCOMP
94 #include "seccomp-util.h"
95 #endif
96
/* Final state of a container run.
 * NOTE(review): the consumers of this enum are outside the visible part of
 * the file; presumably it tells the caller whether to stop or start over. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED
} ContainerStatus;
101
/* Journal linking mode, selected with --link-journal=MODE (or -j). */
typedef enum LinkJournal {
        LINK_NO = 0,    /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto (the default) */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, also what -j selects */
} LinkJournal;
108
/* Volatile mode, selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO = 0,        /* default, or --volatile=<false boolean> */
        VOLATILE_YES,           /* bare --volatile, or --volatile=<true boolean> */
        VOLATILE_STATE,         /* --volatile=state */
} Volatile;
114
115 static char *arg_directory = NULL;
116 static char *arg_user = NULL;
117 static sd_id128_t arg_uuid = {};
118 static char *arg_machine = NULL;
119 static const char *arg_selinux_context = NULL;
120 static const char *arg_selinux_apifs_context = NULL;
121 static const char *arg_slice = NULL;
122 static bool arg_private_network = false;
123 static bool arg_read_only = false;
124 static bool arg_boot = false;
125 static LinkJournal arg_link_journal = LINK_AUTO;
126 static uint64_t arg_retain =
127         (1ULL << CAP_CHOWN) |
128         (1ULL << CAP_DAC_OVERRIDE) |
129         (1ULL << CAP_DAC_READ_SEARCH) |
130         (1ULL << CAP_FOWNER) |
131         (1ULL << CAP_FSETID) |
132         (1ULL << CAP_IPC_OWNER) |
133         (1ULL << CAP_KILL) |
134         (1ULL << CAP_LEASE) |
135         (1ULL << CAP_LINUX_IMMUTABLE) |
136         (1ULL << CAP_NET_BIND_SERVICE) |
137         (1ULL << CAP_NET_BROADCAST) |
138         (1ULL << CAP_NET_RAW) |
139         (1ULL << CAP_SETGID) |
140         (1ULL << CAP_SETFCAP) |
141         (1ULL << CAP_SETPCAP) |
142         (1ULL << CAP_SETUID) |
143         (1ULL << CAP_SYS_ADMIN) |
144         (1ULL << CAP_SYS_CHROOT) |
145         (1ULL << CAP_SYS_NICE) |
146         (1ULL << CAP_SYS_PTRACE) |
147         (1ULL << CAP_SYS_TTY_CONFIG) |
148         (1ULL << CAP_SYS_RESOURCE) |
149         (1ULL << CAP_SYS_BOOT) |
150         (1ULL << CAP_AUDIT_WRITE) |
151         (1ULL << CAP_AUDIT_CONTROL) |
152         (1ULL << CAP_MKNOD);
153 static char **arg_bind = NULL;
154 static char **arg_bind_ro = NULL;
155 static char **arg_tmpfs = NULL;
156 static char **arg_setenv = NULL;
157 static bool arg_quiet = false;
158 static bool arg_share_system = false;
159 static bool arg_register = true;
160 static bool arg_keep_unit = false;
161 static char **arg_network_interfaces = NULL;
162 static char **arg_network_macvlan = NULL;
163 static bool arg_network_veth = false;
164 static const char *arg_network_bridge = NULL;
165 static unsigned long arg_personality = 0xffffffffLU;
166 static const char *arg_image = NULL;
167 static Volatile arg_volatile = VOLATILE_NO;
168
169 static void help(void) {
170         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
171                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
172                "  -h --help                 Show this help\n"
173                "     --version              Print version string\n"
174                "  -q --quiet                Do not show status information\n"
175                "  -D --directory=PATH       Root directory for the container\n"
176                "  -i --image=PATH           File system device or image for the container\n"
177                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
178                "  -u --user=USER            Run the command under specified user or uid\n"
179                "  -M --machine=NAME         Set the machine name for the container\n"
180                "     --uuid=UUID            Set a specific machine UUID for the container\n"
181                "  -S --slice=SLICE          Place the container in the specified slice\n"
182                "     --private-network      Disable network in container\n"
183                "     --network-interface=INTERFACE\n"
184                "                            Assign an existing network interface to the\n"
185                "                            container\n"
186                "     --network-macvlan=INTERFACE\n"
187                "                            Create a macvlan network interface based on an\n"
188                "                            existing network interface to the container\n"
189                "     --network-veth         Add a virtual ethernet connection between host\n"
190                "                            and container\n"
191                "     --network-bridge=INTERFACE\n"
192                "                            Add a virtual ethernet connection between host\n"
193                "                            and container and add it to an existing bridge on\n"
194                "                            the host\n"
195                "  -Z --selinux-context=SECLABEL\n"
196                "                            Set the SELinux security context to be used by\n"
197                "                            processes in the container\n"
198                "  -L --selinux-apifs-context=SECLABEL\n"
199                "                            Set the SELinux security context to be used by\n"
200                "                            API/tmpfs file systems in the container\n"
201                "     --capability=CAP       In addition to the default, retain specified\n"
202                "                            capability\n"
203                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
204                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
205                "  -j                        Equivalent to --link-journal=host\n"
206                "     --read-only            Mount the root directory read-only\n"
207                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
208                "                            the container\n"
209                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
210                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
211                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
212                "     --share-system         Share system namespaces with host\n"
213                "     --register=BOOLEAN     Register container as machine\n"
214                "     --keep-unit            Do not register a scope for the machine, reuse\n"
215                "                            the service unit nspawn is running in\n"
216                "     --volatile[=MODE]      Run the system in volatile mode\n",
217                program_invocation_short_name);
218 }
219
220 static int parse_argv(int argc, char *argv[]) {
221
222         enum {
223                 ARG_VERSION = 0x100,
224                 ARG_PRIVATE_NETWORK,
225                 ARG_UUID,
226                 ARG_READ_ONLY,
227                 ARG_CAPABILITY,
228                 ARG_DROP_CAPABILITY,
229                 ARG_LINK_JOURNAL,
230                 ARG_BIND,
231                 ARG_BIND_RO,
232                 ARG_TMPFS,
233                 ARG_SETENV,
234                 ARG_SHARE_SYSTEM,
235                 ARG_REGISTER,
236                 ARG_KEEP_UNIT,
237                 ARG_NETWORK_INTERFACE,
238                 ARG_NETWORK_MACVLAN,
239                 ARG_NETWORK_VETH,
240                 ARG_NETWORK_BRIDGE,
241                 ARG_PERSONALITY,
242                 ARG_VOLATILE,
243         };
244
245         static const struct option options[] = {
246                 { "help",                  no_argument,       NULL, 'h'                   },
247                 { "version",               no_argument,       NULL, ARG_VERSION           },
248                 { "directory",             required_argument, NULL, 'D'                   },
249                 { "user",                  required_argument, NULL, 'u'                   },
250                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
251                 { "boot",                  no_argument,       NULL, 'b'                   },
252                 { "uuid",                  required_argument, NULL, ARG_UUID              },
253                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
254                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
255                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
256                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
257                 { "bind",                  required_argument, NULL, ARG_BIND              },
258                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
259                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
260                 { "machine",               required_argument, NULL, 'M'                   },
261                 { "slice",                 required_argument, NULL, 'S'                   },
262                 { "setenv",                required_argument, NULL, ARG_SETENV            },
263                 { "selinux-context",       required_argument, NULL, 'Z'                   },
264                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
265                 { "quiet",                 no_argument,       NULL, 'q'                   },
266                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
267                 { "register",              required_argument, NULL, ARG_REGISTER          },
268                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
269                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
270                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
271                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
272                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
273                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
274                 { "image",                 required_argument, NULL, 'i'                   },
275                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
276                 {}
277         };
278
279         int c, r;
280         uint64_t plus = 0, minus = 0;
281
282         assert(argc >= 0);
283         assert(argv);
284
285         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
286
287                 switch (c) {
288
289                 case 'h':
290                         help();
291                         return 0;
292
293                 case ARG_VERSION:
294                         puts(PACKAGE_STRING);
295                         puts(SYSTEMD_FEATURES);
296                         return 0;
297
298                 case 'D':
299                         free(arg_directory);
300                         arg_directory = canonicalize_file_name(optarg);
301                         if (!arg_directory) {
302                                 log_error("Invalid root directory: %m");
303                                 return -ENOMEM;
304                         }
305
306                         break;
307
308                 case 'i':
309                         arg_image = optarg;
310                         break;
311
312                 case 'u':
313                         free(arg_user);
314                         arg_user = strdup(optarg);
315                         if (!arg_user)
316                                 return log_oom();
317
318                         break;
319
320                 case ARG_NETWORK_BRIDGE:
321                         arg_network_bridge = optarg;
322
323                         /* fall through */
324
325                 case ARG_NETWORK_VETH:
326                         arg_network_veth = true;
327                         arg_private_network = true;
328                         break;
329
330                 case ARG_NETWORK_INTERFACE:
331                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
332                                 return log_oom();
333
334                         arg_private_network = true;
335                         break;
336
337                 case ARG_NETWORK_MACVLAN:
338                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
339                                 return log_oom();
340
341                         /* fall through */
342
343                 case ARG_PRIVATE_NETWORK:
344                         arg_private_network = true;
345                         break;
346
347                 case 'b':
348                         arg_boot = true;
349                         break;
350
351                 case ARG_UUID:
352                         r = sd_id128_from_string(optarg, &arg_uuid);
353                         if (r < 0) {
354                                 log_error("Invalid UUID: %s", optarg);
355                                 return r;
356                         }
357                         break;
358
359                 case 'S':
360                         arg_slice = optarg;
361                         break;
362
363                 case 'M':
364                         if (isempty(optarg)) {
365                                 free(arg_machine);
366                                 arg_machine = NULL;
367                         } else {
368
369                                 if (!hostname_is_valid(optarg)) {
370                                         log_error("Invalid machine name: %s", optarg);
371                                         return -EINVAL;
372                                 }
373
374                                 free(arg_machine);
375                                 arg_machine = strdup(optarg);
376                                 if (!arg_machine)
377                                         return log_oom();
378
379                                 break;
380                         }
381
382                 case 'Z':
383                         arg_selinux_context = optarg;
384                         break;
385
386                 case 'L':
387                         arg_selinux_apifs_context = optarg;
388                         break;
389
390                 case ARG_READ_ONLY:
391                         arg_read_only = true;
392                         break;
393
394                 case ARG_CAPABILITY:
395                 case ARG_DROP_CAPABILITY: {
396                         const char *state, *word;
397                         size_t length;
398
399                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
400                                 _cleanup_free_ char *t;
401                                 cap_value_t cap;
402
403                                 t = strndup(word, length);
404                                 if (!t)
405                                         return log_oom();
406
407                                 if (streq(t, "all")) {
408                                         if (c == ARG_CAPABILITY)
409                                                 plus = (uint64_t) -1;
410                                         else
411                                                 minus = (uint64_t) -1;
412                                 } else {
413                                         if (cap_from_name(t, &cap) < 0) {
414                                                 log_error("Failed to parse capability %s.", t);
415                                                 return -EINVAL;
416                                         }
417
418                                         if (c == ARG_CAPABILITY)
419                                                 plus |= 1ULL << (uint64_t) cap;
420                                         else
421                                                 minus |= 1ULL << (uint64_t) cap;
422                                 }
423                         }
424
425                         break;
426                 }
427
428                 case 'j':
429                         arg_link_journal = LINK_GUEST;
430                         break;
431
432                 case ARG_LINK_JOURNAL:
433                         if (streq(optarg, "auto"))
434                                 arg_link_journal = LINK_AUTO;
435                         else if (streq(optarg, "no"))
436                                 arg_link_journal = LINK_NO;
437                         else if (streq(optarg, "guest"))
438                                 arg_link_journal = LINK_GUEST;
439                         else if (streq(optarg, "host"))
440                                 arg_link_journal = LINK_HOST;
441                         else {
442                                 log_error("Failed to parse link journal mode %s", optarg);
443                                 return -EINVAL;
444                         }
445
446                         break;
447
448                 case ARG_BIND:
449                 case ARG_BIND_RO: {
450                         _cleanup_free_ char *a = NULL, *b = NULL;
451                         char *e;
452                         char ***x;
453
454                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
455
456                         e = strchr(optarg, ':');
457                         if (e) {
458                                 a = strndup(optarg, e - optarg);
459                                 b = strdup(e + 1);
460                         } else {
461                                 a = strdup(optarg);
462                                 b = strdup(optarg);
463                         }
464
465                         if (!a || !b)
466                                 return log_oom();
467
468                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
469                                 log_error("Invalid bind mount specification: %s", optarg);
470                                 return -EINVAL;
471                         }
472
473                         r = strv_extend(x, a);
474                         if (r < 0)
475                                 return log_oom();
476
477                         r = strv_extend(x, b);
478                         if (r < 0)
479                                 return log_oom();
480
481                         break;
482                 }
483
484                 case ARG_TMPFS: {
485                         _cleanup_free_ char *a = NULL, *b = NULL;
486                         char *e;
487
488                         e = strchr(optarg, ':');
489                         if (e) {
490                                 a = strndup(optarg, e - optarg);
491                                 b = strdup(e + 1);
492                         } else {
493                                 a = strdup(optarg);
494                                 b = strdup("mode=0755");
495                         }
496
497                         if (!a || !b)
498                                 return log_oom();
499
500                         if (!path_is_absolute(a)) {
501                                 log_error("Invalid tmpfs specification: %s", optarg);
502                                 return -EINVAL;
503                         }
504
505                         r = strv_push(&arg_tmpfs, a);
506                         if (r < 0)
507                                 return log_oom();
508
509                         a = NULL;
510
511                         r = strv_push(&arg_tmpfs, b);
512                         if (r < 0)
513                                 return log_oom();
514
515                         b = NULL;
516
517                         break;
518                 }
519
520                 case ARG_SETENV: {
521                         char **n;
522
523                         if (!env_assignment_is_valid(optarg)) {
524                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
525                                 return -EINVAL;
526                         }
527
528                         n = strv_env_set(arg_setenv, optarg);
529                         if (!n)
530                                 return log_oom();
531
532                         strv_free(arg_setenv);
533                         arg_setenv = n;
534                         break;
535                 }
536
537                 case 'q':
538                         arg_quiet = true;
539                         break;
540
541                 case ARG_SHARE_SYSTEM:
542                         arg_share_system = true;
543                         break;
544
545                 case ARG_REGISTER:
546                         r = parse_boolean(optarg);
547                         if (r < 0) {
548                                 log_error("Failed to parse --register= argument: %s", optarg);
549                                 return r;
550                         }
551
552                         arg_register = r;
553                         break;
554
555                 case ARG_KEEP_UNIT:
556                         arg_keep_unit = true;
557                         break;
558
559                 case ARG_PERSONALITY:
560
561                         arg_personality = personality_from_string(optarg);
562                         if (arg_personality == 0xffffffffLU) {
563                                 log_error("Unknown or unsupported personality '%s'.", optarg);
564                                 return -EINVAL;
565                         }
566
567                         break;
568
569                 case ARG_VOLATILE:
570
571                         if (!optarg)
572                                 arg_volatile = VOLATILE_YES;
573                         else {
574                                 r = parse_boolean(optarg);
575                                 if (r < 0) {
576                                         if (streq(optarg, "state"))
577                                                 arg_volatile = VOLATILE_STATE;
578                                         else {
579                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
580                                                 return r;
581                                         }
582                                 } else
583                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
584                         }
585
586                         break;
587
588                 case '?':
589                         return -EINVAL;
590
591                 default:
592                         assert_not_reached("Unhandled option");
593                 }
594
595         if (arg_share_system)
596                 arg_register = false;
597
598         if (arg_boot && arg_share_system) {
599                 log_error("--boot and --share-system may not be combined.");
600                 return -EINVAL;
601         }
602
603         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
604                 log_error("--keep-unit may not be used when invoked from a user session.");
605                 return -EINVAL;
606         }
607
608         if (arg_directory && arg_image) {
609                 log_error("--directory= and --image= may not be combined.");
610                 return -EINVAL;
611         }
612
613         if (arg_volatile != VOLATILE_NO && arg_read_only) {
614                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
615                 return -EINVAL;
616         }
617
618         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
619
620         return 1;
621 }
622
623 static int mount_all(const char *dest) {
624
625         typedef struct MountPoint {
626                 const char *what;
627                 const char *where;
628                 const char *type;
629                 const char *options;
630                 unsigned long flags;
631                 bool fatal;
632         } MountPoint;
633
634         static const MountPoint mount_table[] = {
635                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
636                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
637                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
638                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
639                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
640                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
641                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
642                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
643 #ifdef HAVE_SELINUX
644                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
645                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
646 #endif
647         };
648
649         unsigned k;
650         int r = 0;
651
652         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
653                 _cleanup_free_ char *where = NULL;
654 #ifdef HAVE_SELINUX
655                 _cleanup_free_ char *options = NULL;
656 #endif
657                 const char *o;
658                 int t;
659
660                 where = strjoin(dest, "/", mount_table[k].where, NULL);
661                 if (!where)
662                         return log_oom();
663
664                 t = path_is_mount_point(where, true);
665                 if (t < 0) {
666                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
667
668                         if (r == 0)
669                                 r = t;
670
671                         continue;
672                 }
673
674                 /* Skip this entry if it is not a remount. */
675                 if (mount_table[k].what && t > 0)
676                         continue;
677
678                 t = mkdir_p(where, 0755);
679                 if (t < 0) {
680                         if (mount_table[k].fatal) {
681                                log_error("Failed to create directory %s: %s", where, strerror(-t));
682
683                                 if (r == 0)
684                                         r = t;
685                         } else
686                                log_warning("Failed to create directory %s: %s", where, strerror(-t));
687
688                         continue;
689                 }
690
691 #ifdef HAVE_SELINUX
692                 if (arg_selinux_apifs_context &&
693                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
694                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
695                         if (!options)
696                                 return log_oom();
697
698                         o = options;
699                 } else
700 #endif
701                         o = mount_table[k].options;
702
703
704                 if (mount(mount_table[k].what,
705                           where,
706                           mount_table[k].type,
707                           mount_table[k].flags,
708                           o) < 0) {
709
710                         if (mount_table[k].fatal) {
711                                 log_error("mount(%s) failed: %m", where);
712
713                                 if (r == 0)
714                                         r = -errno;
715                         } else
716                                 log_warning("mount(%s) failed: %m", where);
717                 }
718         }
719
720         return r;
721 }
722
723 static int mount_binds(const char *dest, char **l, bool ro) {
724         char **x, **y;
725
726         STRV_FOREACH_PAIR(x, y, l) {
727                 _cleanup_free_ char *where = NULL;
728                 struct stat source_st, dest_st;
729                 int r;
730
731                 if (stat(*x, &source_st) < 0) {
732                         log_error("Failed to stat %s: %m", *x);
733                         return -errno;
734                 }
735
736                 where = strappend(dest, *y);
737                 if (!where)
738                         return log_oom();
739
740                 r = stat(where, &dest_st);
741                 if (r == 0) {
742                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
743                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
744                                 return -EINVAL;
745                         }
746                 } else if (errno == ENOENT) {
747                         r = mkdir_parents_label(where, 0755);
748                         if (r < 0) {
749                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
750                                 return r;
751                         }
752                 } else {
753                         log_error("Failed to bind mount %s: %m", *x);
754                         return -errno;
755                 }
756
757                 /* Create the mount point, but be conservative -- refuse to create block
758                  * and char devices. */
759                 if (S_ISDIR(source_st.st_mode)) {
760                         r = mkdir_label(where, 0755);
761                         if (r < 0) {
762                                 log_error("Failed to create mount point %s: %s", where, strerror(-r));
763
764                                 return r;
765                         }
766                 } else if (S_ISFIFO(source_st.st_mode)) {
767                         r = mkfifo(where, 0644);
768                         if (r < 0 && errno != EEXIST) {
769                                 log_error("Failed to create mount point %s: %m", where);
770
771                                 return -errno;
772                         }
773                 } else if (S_ISSOCK(source_st.st_mode)) {
774                         r = mknod(where, 0644 | S_IFSOCK, 0);
775                         if (r < 0 && errno != EEXIST) {
776                                 log_error("Failed to create mount point %s: %m", where);
777
778                                 return -errno;
779                         }
780                 } else if (S_ISREG(source_st.st_mode)) {
781                         r = touch(where);
782                         if (r < 0) {
783                                 log_error("Failed to create mount point %s: %s", where, strerror(-r));
784
785                                 return r;
786                         }
787                 } else {
788                         log_error("Refusing to create mountpoint for file: %s", *x);
789                         return -ENOTSUP;
790                 }
791
792                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
793                         log_error("mount(%s) failed: %m", where);
794                         return -errno;
795                 }
796
797                 if (ro) {
798                         r = bind_remount_recursive(where, true);
799                         if (r < 0) {
800                                 log_error("Read-Only bind mount failed: %s", strerror(-r));
801                                 return r;
802                         }
803                 }
804         }
805
806         return 0;
807 }
808
809 static int mount_tmpfs(const char *dest) {
810         char **i, **o;
811
812         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
813                 _cleanup_free_ char *where = NULL;
814                 int r;
815
816                 where = strappend(dest, *i);
817                 if (!where)
818                         return log_oom();
819
820                 r = mkdir_label(where, 0755);
821                 if (r < 0) {
822                         log_error("creating mount point for tmpfs %s failed: %s", where, strerror(-r));
823
824                         return r;
825                 }
826
827                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
828                         log_error("tmpfs mount to %s failed: %m", where);
829                         return -errno;
830                 }
831         }
832
833         return 0;
834 }
835
836 static int setup_timezone(const char *dest) {
837         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
838         char *z, *y;
839         int r;
840
841         assert(dest);
842
843         /* Fix the timezone, if possible */
844         r = readlink_malloc("/etc/localtime", &p);
845         if (r < 0) {
846                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
847                 return 0;
848         }
849
850         z = path_startswith(p, "../usr/share/zoneinfo/");
851         if (!z)
852                 z = path_startswith(p, "/usr/share/zoneinfo/");
853         if (!z) {
854                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
855                 return 0;
856         }
857
858         where = strappend(dest, "/etc/localtime");
859         if (!where)
860                 return log_oom();
861
862         r = readlink_malloc(where, &q);
863         if (r >= 0) {
864                 y = path_startswith(q, "../usr/share/zoneinfo/");
865                 if (!y)
866                         y = path_startswith(q, "/usr/share/zoneinfo/");
867
868                 /* Already pointing to the right place? Then do nothing .. */
869                 if (y && streq(y, z))
870                         return 0;
871         }
872
873         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
874         if (!check)
875                 return log_oom();
876
877         if (access(check, F_OK) < 0) {
878                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
879                 return 0;
880         }
881
882         what = strappend("../usr/share/zoneinfo/", z);
883         if (!what)
884                 return log_oom();
885
886         r = mkdir_parents(where, 0755);
887         if (r < 0) {
888                 log_error("Failed to create directory for timezone info %s in container: %s", where, strerror(-r));
889
890                 return 0;
891         }
892
893         r = unlink(where);
894         if (r < 0 && errno != ENOENT) {
895                 log_error("Failed to remove existing timezone info %s in container: %m", where);
896
897                 return 0;
898         }
899
900         if (symlink(what, where) < 0) {
901                 log_error("Failed to correct timezone of container: %m");
902                 return 0;
903         }
904
905         return 0;
906 }
907
908 static int setup_resolv_conf(const char *dest) {
909         _cleanup_free_ char *where = NULL;
910         int r;
911
912         assert(dest);
913
914         if (arg_private_network)
915                 return 0;
916
917         /* Fix resolv.conf, if possible */
918         where = strappend(dest, "/etc/resolv.conf");
919         if (!where)
920                 return log_oom();
921
922         /* We don't really care for the results of this really. If it
923          * fails, it fails, but meh... */
924         r = mkdir_parents(where, 0755);
925         if (r < 0) {
926                 log_warning("Failed to create parent directory for resolv.conf %s: %s", where, strerror(-r));
927
928                 return 0;
929         }
930
931         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
932         if (r < 0) {
933                 log_warning("Failed to copy /etc/resolv.conf to %s: %s", where, strerror(-r));
934
935                 return 0;
936         }
937
938         return 0;
939 }
940
941 static int setup_volatile_state(const char *directory) {
942         const char *p;
943         int r;
944
945         assert(directory);
946
947         if (arg_volatile != VOLATILE_STATE)
948                 return 0;
949
950         /* --volatile=state means we simply overmount /var
951            with a tmpfs, and the rest read-only. */
952
953         r = bind_remount_recursive(directory, true);
954         if (r < 0) {
955                 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
956                 return r;
957         }
958
959         p = strappenda(directory, "/var");
960         r = mkdir(p, 0755);
961         if (r < 0 && errno != EEXIST) {
962                 log_error("Failed to create %s: %m", directory);
963                 return -errno;
964         }
965
966         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
967                 log_error("Failed to mount tmpfs to /var: %m");
968                 return -errno;
969         }
970
971         return 0;
972 }
973
974 static int setup_volatile(const char *directory) {
975         bool tmpfs_mounted = false, bind_mounted = false;
976         char template[] = "/tmp/nspawn-volatile-XXXXXX";
977         const char *f, *t;
978         int r;
979
980         assert(directory);
981
982         if (arg_volatile != VOLATILE_YES)
983                 return 0;
984
985         /* --volatile=yes means we mount a tmpfs to the root dir, and
986            the original /usr to use inside it, and that read-only. */
987
988         if (!mkdtemp(template)) {
989                 log_error("Failed to create temporary directory: %m");
990                 return -errno;
991         }
992
993         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
994                 log_error("Failed to mount tmpfs for root directory: %m");
995                 r = -errno;
996                 goto fail;
997         }
998
999         tmpfs_mounted = true;
1000
1001         f = strappenda(directory, "/usr");
1002         t = strappenda(template, "/usr");
1003
1004         r = mkdir(t, 0755);
1005         if (r < 0 && errno != EEXIST) {
1006                 log_error("Failed to create %s: %m", t);
1007                 r = -errno;
1008                 goto fail;
1009         }
1010
1011         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1012                 log_error("Failed to create /usr bind mount: %m");
1013                 r = -errno;
1014                 goto fail;
1015         }
1016
1017         bind_mounted = true;
1018
1019         r = bind_remount_recursive(t, true);
1020         if (r < 0) {
1021                 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
1022                 goto fail;
1023         }
1024
1025         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1026                 log_error("Failed to move root mount: %m");
1027                 r = -errno;
1028                 goto fail;
1029         }
1030
1031         rmdir(template);
1032
1033         return 0;
1034
1035 fail:
1036         if (bind_mounted)
1037                 umount(t);
1038         if (tmpfs_mounted)
1039                 umount(template);
1040         rmdir(template);
1041         return r;
1042 }
1043
1044 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1045
1046         snprintf(s, 37,
1047                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1048                  SD_ID128_FORMAT_VAL(id));
1049
1050         return s;
1051 }
1052
1053 static int setup_boot_id(const char *dest) {
1054         _cleanup_free_ char *from = NULL, *to = NULL;
1055         sd_id128_t rnd = {};
1056         char as_uuid[37];
1057         int r;
1058
1059         assert(dest);
1060
1061         if (arg_share_system)
1062                 return 0;
1063
1064         /* Generate a new randomized boot ID, so that each boot-up of
1065          * the container gets a new one */
1066
1067         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1068         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1069         if (!from || !to)
1070                 return log_oom();
1071
1072         r = sd_id128_randomize(&rnd);
1073         if (r < 0) {
1074                 log_error("Failed to generate random boot id: %s", strerror(-r));
1075                 return r;
1076         }
1077
1078         id128_format_as_uuid(rnd, as_uuid);
1079
1080         r = write_string_file(from, as_uuid);
1081         if (r < 0) {
1082                 log_error("Failed to write boot id: %s", strerror(-r));
1083                 return r;
1084         }
1085
1086         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1087                 log_error("Failed to bind mount boot id: %m");
1088                 r = -errno;
1089         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1090                 log_warning("Failed to make boot id read-only: %m");
1091
1092         unlink(from);
1093         return r;
1094 }
1095
1096 static int copy_devnodes(const char *dest) {
1097
1098         static const char devnodes[] =
1099                 "null\0"
1100                 "zero\0"
1101                 "full\0"
1102                 "random\0"
1103                 "urandom\0"
1104                 "tty\0";
1105
1106         const char *d;
1107         int r = 0;
1108         _cleanup_umask_ mode_t u;
1109
1110         assert(dest);
1111
1112         u = umask(0000);
1113
1114         NULSTR_FOREACH(d, devnodes) {
1115                 _cleanup_free_ char *from = NULL, *to = NULL;
1116                 struct stat st;
1117
1118                 from = strappend("/dev/", d);
1119                 to = strjoin(dest, "/dev/", d, NULL);
1120                 if (!from || !to)
1121                         return log_oom();
1122
1123                 if (stat(from, &st) < 0) {
1124
1125                         if (errno != ENOENT) {
1126                                 log_error("Failed to stat %s: %m", from);
1127                                 return -errno;
1128                         }
1129
1130                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1131
1132                         log_error("%s is not a char or block device, cannot copy", from);
1133                         return -EIO;
1134
1135                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1136
1137                         log_error("mknod(%s) failed: %m", dest);
1138                         return  -errno;
1139                 }
1140         }
1141
1142         return r;
1143 }
1144
1145 static int setup_ptmx(const char *dest) {
1146         _cleanup_free_ char *p = NULL;
1147
1148         p = strappend(dest, "/dev/ptmx");
1149         if (!p)
1150                 return log_oom();
1151
1152         if (symlink("pts/ptmx", p) < 0) {
1153                 log_error("Failed to create /dev/ptmx symlink: %m");
1154                 return -errno;
1155         }
1156
1157         return 0;
1158 }
1159
1160 static int setup_dev_console(const char *dest, const char *console) {
1161         _cleanup_umask_ mode_t u;
1162         const char *to;
1163         struct stat st;
1164         int r;
1165
1166         assert(dest);
1167         assert(console);
1168
1169         u = umask(0000);
1170
1171         if (stat("/dev/null", &st) < 0) {
1172                 log_error("Failed to stat /dev/null: %m");
1173                 return -errno;
1174         }
1175
1176         r = chmod_and_chown(console, 0600, 0, 0);
1177         if (r < 0) {
1178                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
1179                 return r;
1180         }
1181
1182         /* We need to bind mount the right tty to /dev/console since
1183          * ptys can only exist on pts file systems. To have something
1184          * to bind mount things on we create a device node first, and
1185          * use /dev/null for that since we the cgroups device policy
1186          * allows us to create that freely, while we cannot create
1187          * /dev/console. (Note that the major minor doesn't actually
1188          * matter here, since we mount it over anyway). */
1189
1190         to = strappenda(dest, "/dev/console");
1191         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1192                 log_error("mknod() for /dev/console failed: %m");
1193                 return -errno;
1194         }
1195
1196         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1197                 log_error("Bind mount for /dev/console failed: %m");
1198                 return -errno;
1199         }
1200
1201         return 0;
1202 }
1203
1204 static int setup_kmsg(const char *dest, int kmsg_socket) {
1205         _cleanup_free_ char *from = NULL, *to = NULL;
1206         int r, fd, k;
1207         _cleanup_umask_ mode_t u;
1208         union {
1209                 struct cmsghdr cmsghdr;
1210                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1211         } control = {};
1212         struct msghdr mh = {
1213                 .msg_control = &control,
1214                 .msg_controllen = sizeof(control),
1215         };
1216         struct cmsghdr *cmsg;
1217
1218         assert(dest);
1219         assert(kmsg_socket >= 0);
1220
1221         u = umask(0000);
1222
1223         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1224          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1225          * on the reading side behave very similar to /proc/kmsg,
1226          * their writing side behaves differently from /dev/kmsg in
1227          * that writing blocks when nothing is reading. In order to
1228          * avoid any problems with containers deadlocking due to this
1229          * we simply make /dev/kmsg unavailable to the container. */
1230         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1231             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1232                 return log_oom();
1233
1234         if (mkfifo(from, 0600) < 0) {
1235                 log_error("mkfifo() for /dev/kmsg failed: %m");
1236                 return -errno;
1237         }
1238
1239         r = chmod_and_chown(from, 0600, 0, 0);
1240         if (r < 0) {
1241                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1242                 return r;
1243         }
1244
1245         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1246                 log_error("Bind mount for /proc/kmsg failed: %m");
1247                 return -errno;
1248         }
1249
1250         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1251         if (fd < 0) {
1252                 log_error("Failed to open fifo: %m");
1253                 return -errno;
1254         }
1255
1256         cmsg = CMSG_FIRSTHDR(&mh);
1257         cmsg->cmsg_level = SOL_SOCKET;
1258         cmsg->cmsg_type = SCM_RIGHTS;
1259         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1260         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1261
1262         mh.msg_controllen = cmsg->cmsg_len;
1263
1264         /* Store away the fd in the socket, so that it stays open as
1265          * long as we run the child */
1266         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1267         safe_close(fd);
1268
1269         if (k < 0) {
1270                 log_error("Failed to send FIFO fd: %m");
1271                 return -errno;
1272         }
1273
1274         /* And now make the FIFO unavailable as /dev/kmsg... */
1275         unlink(from);
1276         return 0;
1277 }
1278
1279 static int setup_hostname(void) {
1280
1281         if (arg_share_system)
1282                 return 0;
1283
1284         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1285                 return -errno;
1286
1287         return 0;
1288 }
1289
1290 static int setup_journal(const char *directory) {
1291         sd_id128_t machine_id, this_id;
1292         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1293         char *id;
1294         int r;
1295
1296         p = strappend(directory, "/etc/machine-id");
1297         if (!p)
1298                 return log_oom();
1299
1300         r = read_one_line_file(p, &b);
1301         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1302                 return 0;
1303         else if (r < 0) {
1304                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1305                 return r;
1306         }
1307
1308         id = strstrip(b);
1309         if (isempty(id) && arg_link_journal == LINK_AUTO)
1310                 return 0;
1311
1312         /* Verify validity */
1313         r = sd_id128_from_string(id, &machine_id);
1314         if (r < 0) {
1315                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1316                 return r;
1317         }
1318
1319         r = sd_id128_get_machine(&this_id);
1320         if (r < 0) {
1321                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1322                 return r;
1323         }
1324
1325         if (sd_id128_equal(machine_id, this_id)) {
1326                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1327                          "Host and machine ids are equal (%s): refusing to link journals", id);
1328                 if (arg_link_journal == LINK_AUTO)
1329                         return 0;
1330                 return
1331                         -EEXIST;
1332         }
1333
1334         if (arg_link_journal == LINK_NO)
1335                 return 0;
1336
1337         free(p);
1338         p = strappend("/var/log/journal/", id);
1339         q = strjoin(directory, "/var/log/journal/", id, NULL);
1340         if (!p || !q)
1341                 return log_oom();
1342
1343         if (path_is_mount_point(p, false) > 0) {
1344                 if (arg_link_journal != LINK_AUTO) {
1345                         log_error("%s: already a mount point, refusing to use for journal", p);
1346                         return -EEXIST;
1347                 }
1348
1349                 return 0;
1350         }
1351
1352         if (path_is_mount_point(q, false) > 0) {
1353                 if (arg_link_journal != LINK_AUTO) {
1354                         log_error("%s: already a mount point, refusing to use for journal", q);
1355                         return -EEXIST;
1356                 }
1357
1358                 return 0;
1359         }
1360
1361         r = readlink_and_make_absolute(p, &d);
1362         if (r >= 0) {
1363                 if ((arg_link_journal == LINK_GUEST ||
1364                      arg_link_journal == LINK_AUTO) &&
1365                     path_equal(d, q)) {
1366
1367                         r = mkdir_p(q, 0755);
1368                         if (r < 0)
1369                                 log_warning("Failed to create directory %s: %m", q);
1370                         return 0;
1371                 }
1372
1373                 if (unlink(p) < 0) {
1374                         log_error("Failed to remove symlink %s: %m", p);
1375                         return -errno;
1376                 }
1377         } else if (r == -EINVAL) {
1378
1379                 if (arg_link_journal == LINK_GUEST &&
1380                     rmdir(p) < 0) {
1381
1382                         if (errno == ENOTDIR) {
1383                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1384                                 return r;
1385                         } else {
1386                                 log_error("Failed to remove %s: %m", p);
1387                                 return -errno;
1388                         }
1389                 }
1390         } else if (r != -ENOENT) {
1391                 log_error("readlink(%s) failed: %m", p);
1392                 return r;
1393         }
1394
1395         if (arg_link_journal == LINK_GUEST) {
1396
1397                 if (symlink(q, p) < 0) {
1398                         log_error("Failed to symlink %s to %s: %m", q, p);
1399                         return -errno;
1400                 }
1401
1402                 r = mkdir_p(q, 0755);
1403                 if (r < 0)
1404                         log_warning("Failed to create directory %s: %m", q);
1405                 return 0;
1406         }
1407
1408         if (arg_link_journal == LINK_HOST) {
1409                 r = mkdir_p(p, 0755);
1410                 if (r < 0) {
1411                         log_error("Failed to create %s: %m", p);
1412                         return r;
1413                 }
1414
1415         } else if (access(p, F_OK) < 0)
1416                 return 0;
1417
1418         if (dir_is_empty(q) == 0)
1419                 log_warning("%s is not empty, proceeding anyway.", q);
1420
1421         r = mkdir_p(q, 0755);
1422         if (r < 0) {
1423                 log_error("Failed to create %s: %m", q);
1424                 return r;
1425         }
1426
1427         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1428                 log_error("Failed to bind mount journal from host into guest: %m");
1429                 return -errno;
1430         }
1431
1432         return 0;
1433 }
1434
1435 static int setup_kdbus(const char *dest, const char *path) {
1436         const char *p;
1437
1438         if (!path)
1439                 return 0;
1440
1441         p = strappenda(dest, "/dev/kdbus");
1442         if (mkdir(p, 0755) < 0) {
1443                 log_error("Failed to create kdbus path: %m");
1444                 return  -errno;
1445         }
1446
1447         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1448                 log_error("Failed to mount kdbus domain path: %m");
1449                 return -errno;
1450         }
1451
1452         return 0;
1453 }
1454
1455 static int drop_capabilities(void) {
1456         return capability_bounding_set_drop(~arg_retain, false);
1457 }
1458
1459 static int register_machine(pid_t pid, int local_ifindex) {
1460         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1461         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1462         int r;
1463
1464         if (!arg_register)
1465                 return 0;
1466
1467         r = sd_bus_default_system(&bus);
1468         if (r < 0) {
1469                 log_error("Failed to open system bus: %s", strerror(-r));
1470                 return r;
1471         }
1472
1473         if (arg_keep_unit) {
1474                 r = sd_bus_call_method(
1475                                 bus,
1476                                 "org.freedesktop.machine1",
1477                                 "/org/freedesktop/machine1",
1478                                 "org.freedesktop.machine1.Manager",
1479                                 "RegisterMachineWithNetwork",
1480                                 &error,
1481                                 NULL,
1482                                 "sayssusai",
1483                                 arg_machine,
1484                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1485                                 "nspawn",
1486                                 "container",
1487                                 (uint32_t) pid,
1488                                 strempty(arg_directory),
1489                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1490         } else {
1491                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1492
1493                 r = sd_bus_message_new_method_call(
1494                                 bus,
1495                                 &m,
1496                                 "org.freedesktop.machine1",
1497                                 "/org/freedesktop/machine1",
1498                                 "org.freedesktop.machine1.Manager",
1499                                 "CreateMachineWithNetwork");
1500                 if (r < 0) {
1501                         log_error("Failed to create message: %s", strerror(-r));
1502                         return r;
1503                 }
1504
1505                 r = sd_bus_message_append(
1506                                 m,
1507                                 "sayssusai",
1508                                 arg_machine,
1509                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1510                                 "nspawn",
1511                                 "container",
1512                                 (uint32_t) pid,
1513                                 strempty(arg_directory),
1514                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1515                 if (r < 0) {
1516                         log_error("Failed to append message arguments: %s", strerror(-r));
1517                         return r;
1518                 }
1519
1520                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1521                 if (r < 0) {
1522                         log_error("Failed to open container: %s", strerror(-r));
1523                         return r;
1524                 }
1525
1526                 if (!isempty(arg_slice)) {
1527                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1528                         if (r < 0) {
1529                                 log_error("Failed to append slice: %s", strerror(-r));
1530                                 return r;
1531                         }
1532                 }
1533
1534                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1535                 if (r < 0) {
1536                         log_error("Failed to add device policy: %s", strerror(-r));
1537                         return r;
1538                 }
1539
1540                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1541                                           /* Allow the container to
1542                                            * access and create the API
1543                                            * device nodes, so that
1544                                            * PrivateDevices= in the
1545                                            * container can work
1546                                            * fine */
1547                                           "/dev/null", "rwm",
1548                                           "/dev/zero", "rwm",
1549                                           "/dev/full", "rwm",
1550                                           "/dev/random", "rwm",
1551                                           "/dev/urandom", "rwm",
1552                                           "/dev/tty", "rwm",
1553                                           /* Allow the container
1554                                            * access to ptys. However,
1555                                            * do not permit the
1556                                            * container to ever create
1557                                            * these device nodes. */
1558                                           "/dev/pts/ptmx", "rw",
1559                                           "char-pts", "rw",
1560                                           /* Allow the container
1561                                            * access to all kdbus
1562                                            * devices. Again, the
1563                                            * container cannot create
1564                                            * these nodes, only use
1565                                            * them. We use a pretty
1566                                            * open match here, so that
1567                                            * the kernel API can still
1568                                            * change. */
1569                                           "char-kdbus", "rw",
1570                                           "char-kdbus/*", "rw");
1571                 if (r < 0) {
1572                         log_error("Failed to add device whitelist: %s", strerror(-r));
1573                         return r;
1574                 }
1575
1576                 r = sd_bus_message_close_container(m);
1577                 if (r < 0) {
1578                         log_error("Failed to close container: %s", strerror(-r));
1579                         return r;
1580                 }
1581
1582                 r = sd_bus_call(bus, m, 0, &error, NULL);
1583         }
1584
1585         if (r < 0) {
1586                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1587                 return r;
1588         }
1589
1590         return 0;
1591 }
1592
1593 static int terminate_machine(pid_t pid) {
1594         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1595         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1596         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1597         const char *path;
1598         int r;
1599
1600         if (!arg_register)
1601                 return 0;
1602
1603         r = sd_bus_default_system(&bus);
1604         if (r < 0) {
1605                 log_error("Failed to open system bus: %s", strerror(-r));
1606                 return r;
1607         }
1608
1609         r = sd_bus_call_method(
1610                         bus,
1611                         "org.freedesktop.machine1",
1612                         "/org/freedesktop/machine1",
1613                         "org.freedesktop.machine1.Manager",
1614                         "GetMachineByPID",
1615                         &error,
1616                         &reply,
1617                         "u",
1618                         (uint32_t) pid);
1619         if (r < 0) {
1620                 /* Note that the machine might already have been
1621                  * cleaned up automatically, hence don't consider it a
1622                  * failure if we cannot get the machine object. */
1623                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1624                 return 0;
1625         }
1626
1627         r = sd_bus_message_read(reply, "o", &path);
1628         if (r < 0)
1629                 return bus_log_parse_error(r);
1630
1631         r = sd_bus_call_method(
1632                         bus,
1633                         "org.freedesktop.machine1",
1634                         path,
1635                         "org.freedesktop.machine1.Machine",
1636                         "Terminate",
1637                         &error,
1638                         NULL,
1639                         NULL);
1640         if (r < 0) {
1641                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1642                 return 0;
1643         }
1644
1645         return 0;
1646 }
1647
1648 static int reset_audit_loginuid(void) {
1649         _cleanup_free_ char *p = NULL;
1650         int r;
1651
1652         if (arg_share_system)
1653                 return 0;
1654
1655         r = read_one_line_file("/proc/self/loginuid", &p);
1656         if (r == -ENOENT)
1657                 return 0;
1658         if (r < 0) {
1659                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1660                 return r;
1661         }
1662
1663         /* Already reset? */
1664         if (streq(p, "4294967295"))
1665                 return 0;
1666
1667         r = write_string_file("/proc/self/loginuid", "4294967295");
1668         if (r < 0) {
1669                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1670                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1671                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1672                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1673                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1674
1675                 sleep(5);
1676         }
1677
1678         return 0;
1679 }
1680
1681 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1682 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1683
1684 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
1685         int r;
1686
1687         uint8_t result[8];
1688         size_t l, sz;
1689         uint8_t *v;
1690
1691         l = strlen(arg_machine);
1692         sz = sizeof(sd_id128_t) + l;
1693         v = alloca(sz);
1694
1695         /* fetch some persistent data unique to the host */
1696         r = sd_id128_get_machine((sd_id128_t*) v);
1697         if (r < 0)
1698                 return r;
1699
1700         /* combine with some data unique (on this host) to this
1701          * container instance */
1702         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1703
1704         /* Let's hash the host machine ID plus the container name. We
1705          * use a fixed, but originally randomly created hash key here. */
1706         siphash24(result, v, sz, hash_key.bytes);
1707
1708         assert_cc(ETH_ALEN <= sizeof(result));
1709         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1710
1711         /* see eth_random_addr in the kernel */
1712         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1713         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1714
1715         return 0;
1716 }
1717
1718 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1719         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1720         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1721         struct ether_addr mac_host, mac_container;
1722         int r, i;
1723
1724         if (!arg_private_network)
1725                 return 0;
1726
1727         if (!arg_network_veth)
1728                 return 0;
1729
1730         /* Use two different interface name prefixes depending whether
1731          * we are in bridge mode or not. */
1732         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1733                  arg_network_bridge ? "vb" : "ve", arg_machine);
1734
1735         r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
1736         if (r < 0) {
1737                 log_error("Failed to generate predictable MAC address for container side");
1738                 return r;
1739         }
1740
1741         r = generate_mac(&mac_host, HOST_HASH_KEY);
1742         if (r < 0) {
1743                 log_error("Failed to generate predictable MAC address for host side");
1744                 return r;
1745         }
1746
1747         r = sd_rtnl_open(&rtnl, 0);
1748         if (r < 0) {
1749                 log_error("Failed to connect to netlink: %s", strerror(-r));
1750                 return r;
1751         }
1752
1753         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1754         if (r < 0) {
1755                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1756                 return r;
1757         }
1758
1759         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1760         if (r < 0) {
1761                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1762                 return r;
1763         }
1764
1765         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1766         if (r < 0) {
1767                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1768                 return r;
1769         }
1770
1771         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1772         if (r < 0) {
1773                 log_error("Failed to open netlink container: %s", strerror(-r));
1774                 return r;
1775         }
1776
1777         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1778         if (r < 0) {
1779                 log_error("Failed to open netlink container: %s", strerror(-r));
1780                 return r;
1781         }
1782
1783         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1784         if (r < 0) {
1785                 log_error("Failed to open netlink container: %s", strerror(-r));
1786                 return r;
1787         }
1788
1789         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1790         if (r < 0) {
1791                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1792                 return r;
1793         }
1794
1795         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1796         if (r < 0) {
1797                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1798                 return r;
1799         }
1800
1801         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1802         if (r < 0) {
1803                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1804                 return r;
1805         }
1806
1807         r = sd_rtnl_message_close_container(m);
1808         if (r < 0) {
1809                 log_error("Failed to close netlink container: %s", strerror(-r));
1810                 return r;
1811         }
1812
1813         r = sd_rtnl_message_close_container(m);
1814         if (r < 0) {
1815                 log_error("Failed to close netlink container: %s", strerror(-r));
1816                 return r;
1817         }
1818
1819         r = sd_rtnl_message_close_container(m);
1820         if (r < 0) {
1821                 log_error("Failed to close netlink container: %s", strerror(-r));
1822                 return r;
1823         }
1824
1825         r = sd_rtnl_call(rtnl, m, 0, NULL);
1826         if (r < 0) {
1827                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1828                 return r;
1829         }
1830
1831         i = (int) if_nametoindex(iface_name);
1832         if (i <= 0) {
1833                 log_error("Failed to resolve interface %s: %m", iface_name);
1834                 return -errno;
1835         }
1836
1837         *ifi = i;
1838
1839         return 0;
1840 }
1841
1842 static int setup_bridge(const char veth_name[], int *ifi) {
1843         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1844         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1845         int r, bridge;
1846
1847         if (!arg_private_network)
1848                 return 0;
1849
1850         if (!arg_network_veth)
1851                 return 0;
1852
1853         if (!arg_network_bridge)
1854                 return 0;
1855
1856         bridge = (int) if_nametoindex(arg_network_bridge);
1857         if (bridge <= 0) {
1858                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1859                 return -errno;
1860         }
1861
1862         *ifi = bridge;
1863
1864         r = sd_rtnl_open(&rtnl, 0);
1865         if (r < 0) {
1866                 log_error("Failed to connect to netlink: %s", strerror(-r));
1867                 return r;
1868         }
1869
1870         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1871         if (r < 0) {
1872                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1873                 return r;
1874         }
1875
1876         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1877         if (r < 0) {
1878                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1879                 return r;
1880         }
1881
1882         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1883         if (r < 0) {
1884                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1885                 return r;
1886         }
1887
1888         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1889         if (r < 0) {
1890                 log_error("Failed to add netlink master field: %s", strerror(-r));
1891                 return r;
1892         }
1893
1894         r = sd_rtnl_call(rtnl, m, 0, NULL);
1895         if (r < 0) {
1896                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1897                 return r;
1898         }
1899
1900         return 0;
1901 }
1902
1903 static int parse_interface(struct udev *udev, const char *name) {
1904         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1905         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1906         int ifi;
1907
1908         ifi = (int) if_nametoindex(name);
1909         if (ifi <= 0) {
1910                 log_error("Failed to resolve interface %s: %m", name);
1911                 return -errno;
1912         }
1913
1914         sprintf(ifi_str, "n%i", ifi);
1915         d = udev_device_new_from_device_id(udev, ifi_str);
1916         if (!d) {
1917                 log_error("Failed to get udev device for interface %s: %m", name);
1918                 return -errno;
1919         }
1920
1921         if (udev_device_get_is_initialized(d) <= 0) {
1922                 log_error("Network interface %s is not initialized yet.", name);
1923                 return -EBUSY;
1924         }
1925
1926         return ifi;
1927 }
1928
1929 static int move_network_interfaces(pid_t pid) {
1930         _cleanup_udev_unref_ struct udev *udev = NULL;
1931         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1932         char **i;
1933         int r;
1934
1935         if (!arg_private_network)
1936                 return 0;
1937
1938         if (strv_isempty(arg_network_interfaces))
1939                 return 0;
1940
1941         r = sd_rtnl_open(&rtnl, 0);
1942         if (r < 0) {
1943                 log_error("Failed to connect to netlink: %s", strerror(-r));
1944                 return r;
1945         }
1946
1947         udev = udev_new();
1948         if (!udev) {
1949                 log_error("Failed to connect to udev.");
1950                 return -ENOMEM;
1951         }
1952
1953         STRV_FOREACH(i, arg_network_interfaces) {
1954                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1955                 int ifi;
1956
1957                 ifi = parse_interface(udev, *i);
1958                 if (ifi < 0)
1959                         return ifi;
1960
1961                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1962                 if (r < 0) {
1963                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1964                         return r;
1965                 }
1966
1967                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1968                 if (r < 0) {
1969                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1970                         return r;
1971                 }
1972
1973                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1974                 if (r < 0) {
1975                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1976                         return r;
1977                 }
1978         }
1979
1980         return 0;
1981 }
1982
1983 static int setup_macvlan(pid_t pid) {
1984         _cleanup_udev_unref_ struct udev *udev = NULL;
1985         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1986         char **i;
1987         int r;
1988
1989         if (!arg_private_network)
1990                 return 0;
1991
1992         if (strv_isempty(arg_network_macvlan))
1993                 return 0;
1994
1995         r = sd_rtnl_open(&rtnl, 0);
1996         if (r < 0) {
1997                 log_error("Failed to connect to netlink: %s", strerror(-r));
1998                 return r;
1999         }
2000
2001         udev = udev_new();
2002         if (!udev) {
2003                 log_error("Failed to connect to udev.");
2004                 return -ENOMEM;
2005         }
2006
2007         STRV_FOREACH(i, arg_network_macvlan) {
2008                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2009                 _cleanup_free_ char *n = NULL;
2010                 int ifi;
2011
2012                 ifi = parse_interface(udev, *i);
2013                 if (ifi < 0)
2014                         return ifi;
2015
2016                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2017                 if (r < 0) {
2018                         log_error("Failed to allocate netlink message: %s", strerror(-r));
2019                         return r;
2020                 }
2021
2022                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2023                 if (r < 0) {
2024                         log_error("Failed to add netlink interface index: %s", strerror(-r));
2025                         return r;
2026                 }
2027
2028                 n = strappend("mv-", *i);
2029                 if (!n)
2030                         return log_oom();
2031
2032                 strshorten(n, IFNAMSIZ-1);
2033
2034                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2035                 if (r < 0) {
2036                         log_error("Failed to add netlink interface name: %s", strerror(-r));
2037                         return r;
2038                 }
2039
2040                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2041                 if (r < 0) {
2042                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
2043                         return r;
2044                 }
2045
2046                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2047                 if (r < 0) {
2048                         log_error("Failed to open netlink container: %s", strerror(-r));
2049                         return r;
2050                 }
2051
2052                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2053                 if (r < 0) {
2054                         log_error("Failed to open netlink container: %s", strerror(-r));
2055                         return r;
2056                 }
2057
2058                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2059                 if (r < 0) {
2060                         log_error("Failed to append macvlan mode: %s", strerror(-r));
2061                         return r;
2062                 }
2063
2064                 r = sd_rtnl_message_close_container(m);
2065                 if (r < 0) {
2066                         log_error("Failed to close netlink container: %s", strerror(-r));
2067                         return r;
2068                 }
2069
2070                 r = sd_rtnl_message_close_container(m);
2071                 if (r < 0) {
2072                         log_error("Failed to close netlink container: %s", strerror(-r));
2073                         return r;
2074                 }
2075
2076                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2077                 if (r < 0) {
2078                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
2079                         return r;
2080                 }
2081         }
2082
2083         return 0;
2084 }
2085
/* Builds and loads a seccomp filter that allows everything by default,
 * makes a fixed list of system calls fail with EPERM, and makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns 0 on success or a negative errno-style value after logging the
 * failure. When built without HAVE_SECCOMP this is a no-op returning 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blocked_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[k], 0);

                /* -EFAULT is taken to mean "unknown syscall" and is
                 * skipped just like a successfully added rule */
                if (r >= 0 || r == -EFAULT)
                        continue;

                log_error("Failed to block syscall: %s", strerror(-r));
                goto finish;
        }

        /*
           Audit is broken in containers: much of the userspace audit
           hookup fails when run inside one. We do not care and simply
           turn off the creation of audit sockets.

           The rule below makes socket(AF_NETLINK, *, NETLINK_AUDIT)
           fail with EAFNOSUPPORT, which audit userspace takes as the
           indication that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(ctx,
                             SCMP_ACT_ERRNO(EAFNOSUPPORT),
                             SCMP_SYS(socket),
                             2,
                             SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                             SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        /* Clear the filter's NNP attribute before loading it */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* The filter context is released on every path past seccomp_init() */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2165
2166 static int setup_image(char **device_path, int *loop_nr) {
2167         struct loop_info64 info = {
2168                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2169         };
2170         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2171         _cleanup_free_ char* loopdev = NULL;
2172         struct stat st;
2173         int r, nr;
2174
2175         assert(device_path);
2176         assert(loop_nr);
2177
2178         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2179         if (fd < 0) {
2180                 log_error("Failed to open %s: %m", arg_image);
2181                 return -errno;
2182         }
2183
2184         if (fstat(fd, &st) < 0) {
2185                 log_error("Failed to stat %s: %m", arg_image);
2186                 return -errno;
2187         }
2188
2189         if (S_ISBLK(st.st_mode)) {
2190                 char *p;
2191
2192                 p = strdup(arg_image);
2193                 if (!p)
2194                         return log_oom();
2195
2196                 *device_path = p;
2197
2198                 *loop_nr = -1;
2199
2200                 r = fd;
2201                 fd = -1;
2202
2203                 return r;
2204         }
2205
2206         if (!S_ISREG(st.st_mode)) {
2207                 log_error("%s is not a regular file or block device: %m", arg_image);
2208                 return -EINVAL;
2209         }
2210
2211         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2212         if (control < 0) {
2213                 log_error("Failed to open /dev/loop-control: %m");
2214                 return -errno;
2215         }
2216
2217         nr = ioctl(control, LOOP_CTL_GET_FREE);
2218         if (nr < 0) {
2219                 log_error("Failed to allocate loop device: %m");
2220                 return -errno;
2221         }
2222
2223         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2224                 return log_oom();
2225
2226         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2227         if (loop < 0) {
2228                 log_error("Failed to open loop device %s: %m", loopdev);
2229                 return -errno;
2230         }
2231
2232         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2233                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2234                 return -errno;
2235         }
2236
2237         if (arg_read_only)
2238                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2239
2240         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2241                 log_error("Failed to set loopback settings on %s: %m", loopdev);
2242                 return -errno;
2243         }
2244
2245         *device_path = loopdev;
2246         loopdev = NULL;
2247
2248         *loop_nr = nr;
2249
2250         r = loop;
2251         loop = -1;
2252
2253         return r;
2254 }
2255
/* Probes the block device open on 'fd' for a GPT partition table and
 * picks the partitions to use as root, /home and /srv.
 *
 * Partitions are matched by their GPT type ID (GPT_HOME, GPT_SRV and,
 * where defined at build time, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY).
 * Partitions flagged GPT_FLAG_NO_AUTO are ignored; if several partitions
 * of one type exist, the one with the lowest partition number wins.
 *
 * On success 0 is returned and:
 *   - *root_device / *root_device_rw describe the native root partition
 *     or, if there is none, the secondary one; *secondary tells which;
 *   - *home_device / *home_device_rw and *srv_device / *srv_device_rw are
 *     set only if such a partition was found, and left untouched
 *     otherwise.
 * The device strings are newly allocated and handed over to the caller;
 * the *_rw values are false when the partition carries
 * GPT_FLAG_READ_ONLY.
 *
 * Returns -EINVAL if no partition table, no GPT or no root partition is
 * found, and another negative errno-style value on other failures. When
 * built without HAVE_BLKID, always fails with -ENOTSUP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerate = NULL;
        _cleanup_udev_device_unref_ struct udev_device *disk = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist partitions;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        /* --- Step 1: read the partition table with libblkid --- */

        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        /* errno is cleared before each library call below so that a
         * failure which leaves it untouched can be told apart */
        errno = 0;
        r = blkid_probe_set_device(probe, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }
        if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(probe, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partitions = blkid_probe_get_partitions(probe);
        if (!partitions) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        /* --- Step 2: enumerate the child devices of the disk via udev --- */

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        disk = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!disk)
                return log_oom();

        enumerate = udev_enumerate_new(udev);
        if (!enumerate)
                return log_oom();

        r = udev_enumerate_add_match_parent(enumerate, disk);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(enumerate);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        /* --- Step 3: match each device against the partition table --- */

        first = udev_enumerate_get_list_entry(enumerate);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *part = NULL;
                const char *type_str, *devnode;
                unsigned long long pflags;
                sd_id128_t type_id;
                blkid_partition entry;
                dev_t devno;
                bool writable;
                int partno;

                errno = 0;
                part = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!part) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number, and the whole
                 * disk itself */
                devno = udev_device_get_devnum(part);
                if (major(devno) == 0 || st.st_rdev == devno)
                        continue;

                devnode = udev_device_get_devnode(part);
                if (!devnode)
                        continue;

                entry = blkid_partlist_devno_to_partition(partitions, devno);
                if (!entry)
                        continue;

                pflags = blkid_partition_get_flags(entry);
                if (pflags & GPT_FLAG_NO_AUTO)
                        continue;

                partno = blkid_partition_get_partno(entry);
                if (partno < 0)
                        continue;

                type_str = blkid_partition_get_type_string(entry);
                if (!type_str)
                        continue;

                if (sd_id128_from_string(type_str, &type_id) < 0)
                        continue;

                writable = !(pflags & GPT_FLAG_READ_ONLY);

                /* In every branch below a candidate only replaces an
                 * earlier one of the same type if its partition number
                 * is lower. */
                if (sd_id128_equal(type_id, GPT_HOME)) {

                        if (home && partno >= home_nr)
                                continue;

                        home_nr = partno;
                        home_rw = writable;

                        free(home);
                        home = strdup(devnode);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_id, GPT_SRV)) {

                        if (srv && partno >= srv_nr)
                                continue;

                        srv_nr = partno;
                        srv_rw = writable;

                        free(srv);
                        srv = strdup(devnode);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                        if (root && partno >= root_nr)
                                continue;

                        root_nr = partno;
                        root_rw = writable;

                        free(root);
                        root = strdup(devnode);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && partno >= secondary_root_nr)
                                continue;

                        secondary_root_nr = partno;
                        secondary_root_rw = writable;

                        free(secondary_root);
                        secondary_root = strdup(devnode);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        /* --- Step 4: hand the results over to the caller --- */

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* The native root partition is preferred over the secondary one.
         * Each local pointer is reset to NULL after the hand-over so that
         * the cleanup handler does not free the string that was just
         * returned. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2505
/*
 * Probe the file system type of a block device with libblkid and mount it.
 *
 * what      - path of the block device to mount
 * where     - base directory of the mount point
 * directory - optional suffix appended to "where" to form the final mount
 *             point (e.g. "/home"); NULL mounts directly on "where"
 * rw        - mount writable if true; forced to false when arg_read_only
 *             is set
 *
 * Returns 0 on success and a negative errno-style value on failure:
 * -EINVAL if no (or no unambiguous) file system type could be detected,
 * -ENOTSUP for LUKS containers or when built without blkid support, and
 * the probing/mount errno otherwise.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, so clear it first and
         * treat "still zero" as an allocation failure. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe(): 1 means nothing was detected, -2
                 * means the result is ambivalent. Neither is an I/O error;
                 * we simply cannot tell which file system this is. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Anything else non-zero (i.e. -1) is a genuine probing
                 * failure; fall back to EIO if errno was left untouched. */
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2571
/*
 * Mount the partitions discovered in a disk image below "where".
 *
 * The root device is mounted on "where" itself, the home device on
 * "where"/home and the srv device on "where"/srv.  Any device passed as
 * NULL is skipped.  The *_rw flags request a writable mount for the
 * respective device.
 *
 * Returns 0 on success, or the negative error returned by mount_device()
 * for the first mount that fails.
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the caller-supplied base directory rather
         * than reaching for the global arg_directory. */
        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2607
2608 static void loop_remove(int nr, int *image_fd) {
2609         _cleanup_close_ int control = -1;
2610         int r;
2611
2612         if (nr < 0)
2613                 return;
2614
2615         if (image_fd && *image_fd >= 0) {
2616                 r = ioctl(*image_fd, LOOP_CLR_FD);
2617                 if (r < 0)
2618                         log_warning("Failed to close loop image: %m");
2619                 *image_fd = safe_close(*image_fd);
2620         }
2621
2622         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2623         if (control < 0) {
2624                 log_warning("Failed to open /dev/loop-control: %m");
2625                 return;
2626         }
2627
2628         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2629         if (r < 0)
2630                 log_warning("Failed to remove loop %d: %m", nr);
2631 }
2632
2633 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2634         int pipe_fds[2];
2635         pid_t pid;
2636
2637         assert(database);
2638         assert(key);
2639         assert(rpid);
2640
2641         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2642                 log_error("Failed to allocate pipe: %m");
2643                 return -errno;
2644         }
2645
2646         pid = fork();
2647         if (pid < 0) {
2648                 log_error("Failed to fork getent child: %m");
2649                 return -errno;
2650         } else if (pid == 0) {
2651                 int nullfd;
2652                 char *empty_env = NULL;
2653
2654                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2655                         _exit(EXIT_FAILURE);
2656
2657                 if (pipe_fds[0] > 2)
2658                         safe_close(pipe_fds[0]);
2659                 if (pipe_fds[1] > 2)
2660                         safe_close(pipe_fds[1]);
2661
2662                 nullfd = open("/dev/null", O_RDWR);
2663                 if (nullfd < 0)
2664                         _exit(EXIT_FAILURE);
2665
2666                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2667                         _exit(EXIT_FAILURE);
2668
2669                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2670                         _exit(EXIT_FAILURE);
2671
2672                 if (nullfd > 2)
2673                         safe_close(nullfd);
2674
2675                 reset_all_signal_handlers();
2676                 close_all_fds(NULL, 0);
2677
2678                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2679                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2680                 _exit(EXIT_FAILURE);
2681         }
2682
2683         pipe_fds[1] = safe_close(pipe_fds[1]);
2684
2685         *rpid = pid;
2686
2687         return pipe_fds[0];
2688 }
2689
2690 static int change_uid_gid(char **_home) {
2691         char line[LINE_MAX], *x, *u, *g, *h;
2692         const char *word, *state;
2693         _cleanup_free_ uid_t *uids = NULL;
2694         _cleanup_free_ char *home = NULL;
2695         _cleanup_fclose_ FILE *f = NULL;
2696         _cleanup_close_ int fd = -1;
2697         unsigned n_uids = 0;
2698         size_t sz = 0, l;
2699         uid_t uid;
2700         gid_t gid;
2701         pid_t pid;
2702         int r;
2703
2704         assert(_home);
2705
2706         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2707                 /* Reset everything fully to 0, just in case */
2708
2709                 if (setgroups(0, NULL) < 0) {
2710                         log_error("setgroups() failed: %m");
2711                         return -errno;
2712                 }
2713
2714                 if (setresgid(0, 0, 0) < 0) {
2715                         log_error("setregid() failed: %m");
2716                         return -errno;
2717                 }
2718
2719                 if (setresuid(0, 0, 0) < 0) {
2720                         log_error("setreuid() failed: %m");
2721                         return -errno;
2722                 }
2723
2724                 *_home = NULL;
2725                 return 0;
2726         }
2727
2728         /* First, get user credentials */
2729         fd = spawn_getent("passwd", arg_user, &pid);
2730         if (fd < 0)
2731                 return fd;
2732
2733         f = fdopen(fd, "r");
2734         if (!f)
2735                 return log_oom();
2736         fd = -1;
2737
2738         if (!fgets(line, sizeof(line), f)) {
2739
2740                 if (!ferror(f)) {
2741                         log_error("Failed to resolve user %s.", arg_user);
2742                         return -ESRCH;
2743                 }
2744
2745                 log_error("Failed to read from getent: %m");
2746                 return -errno;
2747         }
2748
2749         truncate_nl(line);
2750
2751         wait_for_terminate_and_warn("getent passwd", pid);
2752
2753         x = strchr(line, ':');
2754         if (!x) {
2755                 log_error("/etc/passwd entry has invalid user field.");
2756                 return -EIO;
2757         }
2758
2759         u = strchr(x+1, ':');
2760         if (!u) {
2761                 log_error("/etc/passwd entry has invalid password field.");
2762                 return -EIO;
2763         }
2764
2765         u++;
2766         g = strchr(u, ':');
2767         if (!g) {
2768                 log_error("/etc/passwd entry has invalid UID field.");
2769                 return -EIO;
2770         }
2771
2772         *g = 0;
2773         g++;
2774         x = strchr(g, ':');
2775         if (!x) {
2776                 log_error("/etc/passwd entry has invalid GID field.");
2777                 return -EIO;
2778         }
2779
2780         *x = 0;
2781         h = strchr(x+1, ':');
2782         if (!h) {
2783                 log_error("/etc/passwd entry has invalid GECOS field.");
2784                 return -EIO;
2785         }
2786
2787         h++;
2788         x = strchr(h, ':');
2789         if (!x) {
2790                 log_error("/etc/passwd entry has invalid home directory field.");
2791                 return -EIO;
2792         }
2793
2794         *x = 0;
2795
2796         r = parse_uid(u, &uid);
2797         if (r < 0) {
2798                 log_error("Failed to parse UID of user.");
2799                 return -EIO;
2800         }
2801
2802         r = parse_gid(g, &gid);
2803         if (r < 0) {
2804                 log_error("Failed to parse GID of user.");
2805                 return -EIO;
2806         }
2807
2808         home = strdup(h);
2809         if (!home)
2810                 return log_oom();
2811
2812         /* Second, get group memberships */
2813         fd = spawn_getent("initgroups", arg_user, &pid);
2814         if (fd < 0)
2815                 return fd;
2816
2817         fclose(f);
2818         f = fdopen(fd, "r");
2819         if (!f)
2820                 return log_oom();
2821         fd = -1;
2822
2823         if (!fgets(line, sizeof(line), f)) {
2824                 if (!ferror(f)) {
2825                         log_error("Failed to resolve user %s.", arg_user);
2826                         return -ESRCH;
2827                 }
2828
2829                 log_error("Failed to read from getent: %m");
2830                 return -errno;
2831         }
2832
2833         truncate_nl(line);
2834
2835         wait_for_terminate_and_warn("getent initgroups", pid);
2836
2837         /* Skip over the username and subsequent separator whitespace */
2838         x = line;
2839         x += strcspn(x, WHITESPACE);
2840         x += strspn(x, WHITESPACE);
2841
2842         FOREACH_WORD(word, l, x, state) {
2843                 char c[l+1];
2844
2845                 memcpy(c, word, l);
2846                 c[l] = 0;
2847
2848                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2849                         return log_oom();
2850
2851                 r = parse_uid(c, &uids[n_uids++]);
2852                 if (r < 0) {
2853                         log_error("Failed to parse group data from getent.");
2854                         return -EIO;
2855                 }
2856         }
2857
2858         r = mkdir_parents(home, 0775);
2859         if (r < 0) {
2860                 log_error("Failed to make home root directory: %s", strerror(-r));
2861                 return r;
2862         }
2863
2864         r = mkdir_safe(home, 0755, uid, gid);
2865         if (r < 0 && r != -EEXIST) {
2866                 log_error("Failed to make home directory: %s", strerror(-r));
2867                 return r;
2868         }
2869
2870         fchown(STDIN_FILENO, uid, gid);
2871         fchown(STDOUT_FILENO, uid, gid);
2872         fchown(STDERR_FILENO, uid, gid);
2873
2874         if (setgroups(n_uids, uids) < 0) {
2875                 log_error("Failed to set auxiliary groups: %m");
2876                 return -errno;
2877         }
2878
2879         if (setresgid(gid, gid, gid) < 0) {
2880                 log_error("setregid() failed: %m");
2881                 return -errno;
2882         }
2883
2884         if (setresuid(uid, uid, uid) < 0) {
2885                 log_error("setreuid() failed: %m");
2886                 return -errno;
2887         }
2888
2889         if (_home) {
2890                 *_home = home;
2891                 home = NULL;
2892         }
2893
2894         return 0;
2895 }
2896
2897 /*
2898  * Return values:
2899  * < 0 : wait_for_terminate() failed to get the state of the
2900  *       container, the container was terminated by a signal, or
2901  *       failed for an unknown reason.  No change is made to the
2902  *       container argument.
2903  * > 0 : The program executed in the container terminated with an
2904  *       error.  The exit code of the program executed in the
2905  *       container is returned.  No change is made to the container
2906  *       argument.
2907  *   0 : The container is being rebooted, has been shut down or exited
2908  *       successfully.  The container argument has been set to either
2909  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2910  *
2911  * That is, success is indicated by a return value of zero, and an
2912  * error is indicated by a non-zero value.
2913  */
2914 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2915         int r;
2916         siginfo_t status;
2917
2918         r = wait_for_terminate(pid, &status);
2919         if (r < 0) {
2920                 log_warning("Failed to wait for container: %s", strerror(-r));
2921                 return r;
2922         }
2923
2924         switch (status.si_code) {
2925         case CLD_EXITED:
2926                 r = status.si_status;
2927                 if (r == 0) {
2928                         if (!arg_quiet)
2929                                 log_debug("Container %s exited successfully.",
2930                                           arg_machine);
2931
2932                         *container = CONTAINER_TERMINATED;
2933                 } else {
2934                         log_error("Container %s failed with error code %i.",
2935                                   arg_machine, status.si_status);
2936                 }
2937                 break;
2938
2939         case CLD_KILLED:
2940                 if (status.si_status == SIGINT) {
2941                         if (!arg_quiet)
2942                                 log_info("Container %s has been shut down.",
2943                                          arg_machine);
2944
2945                         *container = CONTAINER_TERMINATED;
2946                         r = 0;
2947                         break;
2948                 } else if (status.si_status == SIGHUP) {
2949                         if (!arg_quiet)
2950                                 log_info("Container %s is being rebooted.",
2951                                          arg_machine);
2952
2953                         *container = CONTAINER_REBOOTED;
2954                         r = 0;
2955                         break;
2956                 }
2957                 /* CLD_KILLED fallthrough */
2958
2959         case CLD_DUMPED:
2960                 log_error("Container %s terminated by signal %s.",
2961                           arg_machine, signal_to_string(status.si_status));
2962                 r = -1;
2963                 break;
2964
2965         default:
2966                 log_error("Container %s failed due to unknown reason.",
2967                           arg_machine);
2968                 r = -1;
2969                 break;
2970         }
2971
2972         return r;
2973 }
2974
/* Signal handler that intentionally does nothing.  main() installs it for
 * SIGCHLD so that the signal interrupts the parent's blocking calls
 * instead of being ignored. */
static void nop_handler(int sig) {
        (void) sig;
}
2976
2977 int main(int argc, char *argv[]) {
2978
2979         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2980         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2981         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2982         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2983         _cleanup_fdset_free_ FDSet *fds = NULL;
2984         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2985         const char *console = NULL;
2986         char veth_name[IFNAMSIZ];
2987         bool secondary = false;
2988         sigset_t mask, mask_chld;
2989         pid_t pid = 0;
2990
2991         log_parse_environment();
2992         log_open();
2993
2994         k = parse_argv(argc, argv);
2995         if (k < 0)
2996                 goto finish;
2997         else if (k == 0) {
2998                 r = EXIT_SUCCESS;
2999                 goto finish;
3000         }
3001
3002         if (!arg_image) {
3003                 if (arg_directory) {
3004                         char *p;
3005
3006                         p = path_make_absolute_cwd(arg_directory);
3007                         free(arg_directory);
3008                         arg_directory = p;
3009                 } else
3010                         arg_directory = get_current_dir_name();
3011
3012                 if (!arg_directory) {
3013                         log_error("Failed to determine path, please use -D.");
3014                         goto finish;
3015                 }
3016                 path_kill_slashes(arg_directory);
3017         }
3018
3019         if (!arg_machine) {
3020                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
3021                 if (!arg_machine) {
3022                         log_oom();
3023                         goto finish;
3024                 }
3025
3026                 hostname_cleanup(arg_machine, false);
3027                 if (isempty(arg_machine)) {
3028                         log_error("Failed to determine machine name automatically, please use -M.");
3029                         goto finish;
3030                 }
3031         }
3032
3033         if (geteuid() != 0) {
3034                 log_error("Need to be root.");
3035                 goto finish;
3036         }
3037
3038         if (sd_booted() <= 0) {
3039                 log_error("Not running on a systemd system.");
3040                 goto finish;
3041         }
3042
3043         log_close();
3044         n_fd_passed = sd_listen_fds(false);
3045         if (n_fd_passed > 0) {
3046                 k = fdset_new_listen_fds(&fds, false);
3047                 if (k < 0) {
3048                         log_error("Failed to collect file descriptors: %s", strerror(-k));
3049                         goto finish;
3050                 }
3051         }
3052         fdset_close_others(fds);
3053         log_open();
3054
3055         if (arg_directory) {
3056                 if (path_equal(arg_directory, "/")) {
3057                         log_error("Spawning container on root directory not supported.");
3058                         goto finish;
3059                 }
3060
3061                 if (arg_boot) {
3062                         if (path_is_os_tree(arg_directory) <= 0) {
3063                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3064                                 goto finish;
3065                         }
3066                 } else {
3067                         const char *p;
3068
3069                         p = strappenda(arg_directory,
3070                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3071                         if (access(p, F_OK) < 0) {
3072                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3073                                 goto finish;
3074
3075                         }
3076                 }
3077         } else {
3078                 char template[] = "/tmp/nspawn-root-XXXXXX";
3079
3080                 if (!mkdtemp(template)) {
3081                         log_error("Failed to create temporary directory: %m");
3082                         r = -errno;
3083                         goto finish;
3084                 }
3085
3086                 arg_directory = strdup(template);
3087                 if (!arg_directory) {
3088                         r = log_oom();
3089                         goto finish;
3090                 }
3091
3092                 image_fd = setup_image(&device_path, &loop_nr);
3093                 if (image_fd < 0) {
3094                         r = image_fd;
3095                         goto finish;
3096                 }
3097
3098                 r = dissect_image(image_fd,
3099                                   &root_device, &root_device_rw,
3100                                   &home_device, &home_device_rw,
3101                                   &srv_device, &srv_device_rw,
3102                                   &secondary);
3103                 if (r < 0)
3104                         goto finish;
3105         }
3106
3107         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3108         if (master < 0) {
3109                 log_error("Failed to acquire pseudo tty: %m");
3110                 goto finish;
3111         }
3112
3113         console = ptsname(master);
3114         if (!console) {
3115                 log_error("Failed to determine tty name: %m");
3116                 goto finish;
3117         }
3118
3119         if (!arg_quiet)
3120                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3121                          arg_machine, arg_image ? arg_image : arg_directory);
3122
3123         if (unlockpt(master) < 0) {
3124                 log_error("Failed to unlock tty: %m");
3125                 goto finish;
3126         }
3127
3128         if (access("/dev/kdbus/control", F_OK) >= 0) {
3129
3130                 if (arg_share_system) {
3131                         kdbus_domain = strdup("/dev/kdbus");
3132                         if (!kdbus_domain) {
3133                                 log_oom();
3134                                 goto finish;
3135                         }
3136                 } else {
3137                         const char *ns;
3138
3139                         ns = strappenda("machine-", arg_machine);
3140                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
3141                         if (r < 0)
3142                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
3143                         else
3144                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
3145                 }
3146         }
3147
3148         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3149                 log_error("Failed to create kmsg socket pair: %m");
3150                 goto finish;
3151         }
3152
3153         sd_notify(false,
3154                   "READY=1\n"
3155                   "STATUS=Container running.");
3156
3157         assert_se(sigemptyset(&mask) == 0);
3158         assert_se(sigemptyset(&mask_chld) == 0);
3159         sigaddset(&mask_chld, SIGCHLD);
3160         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3161         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3162
3163         for (;;) {
3164                 ContainerStatus container_status;
3165                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3166                 struct sigaction sa = {
3167                         .sa_handler = nop_handler,
3168                         .sa_flags = SA_NOCLDSTOP,
3169                 };
3170
3171                 r = barrier_create(&barrier);
3172                 if (r < 0) {
3173                         log_error("Cannot initialize IPC barrier: %s", strerror(-r));
3174                         goto finish;
3175                 }
3176
3177                 /* Child can be killed before execv(), so handle SIGCHLD
3178                  * in order to interrupt parent's blocking calls and
3179                  * give it a chance to call wait() and terminate. */
3180                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3181                 if (r < 0) {
3182                         log_error("Failed to change the signal mask: %m");
3183                         goto finish;
3184                 }
3185
3186                 r = sigaction(SIGCHLD, &sa, NULL);
3187                 if (r < 0) {
3188                         log_error("Failed to install SIGCHLD handler: %m");
3189                         goto finish;
3190                 }
3191
3192                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3193                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3194                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
3195                 if (pid < 0) {
3196                         if (errno == EINVAL)
3197                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3198                         else
3199                                 log_error("clone() failed: %m");
3200
3201                         r = pid;
3202                         goto finish;
3203                 }
3204
3205                 if (pid == 0) {
3206                         /* child */
3207                         _cleanup_free_ char *home = NULL;
3208                         unsigned n_env = 2;
3209                         const char *envp[] = {
3210                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3211                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3212                                 NULL, /* TERM */
3213                                 NULL, /* HOME */
3214                                 NULL, /* USER */
3215                                 NULL, /* LOGNAME */
3216                                 NULL, /* container_uuid */
3217                                 NULL, /* LISTEN_FDS */
3218                                 NULL, /* LISTEN_PID */
3219                                 NULL
3220                         };
3221                         char **env_use;
3222
3223                         barrier_set_role(&barrier, BARRIER_CHILD);
3224
3225                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3226                         if (envp[n_env])
3227                                 n_env ++;
3228
3229                         master = safe_close(master);
3230
3231                         close_nointr(STDIN_FILENO);
3232                         close_nointr(STDOUT_FILENO);
3233                         close_nointr(STDERR_FILENO);
3234
3235                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3236
3237                         reset_all_signal_handlers();
3238                         reset_signal_mask();
3239
3240                         k = open_terminal(console, O_RDWR);
3241                         if (k != STDIN_FILENO) {
3242                                 if (k >= 0) {
3243                                         safe_close(k);
3244                                         k = -EINVAL;
3245                                 }
3246
3247                                 log_error("Failed to open console: %s", strerror(-k));
3248                                 _exit(EXIT_FAILURE);
3249                         }
3250
3251                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3252                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3253                                 log_error("Failed to duplicate console: %m");
3254                                 _exit(EXIT_FAILURE);
3255                         }
3256
3257                         if (setsid() < 0) {
3258                                 log_error("setsid() failed: %m");
3259                                 _exit(EXIT_FAILURE);
3260                         }
3261
3262                         if (reset_audit_loginuid() < 0)
3263                                 _exit(EXIT_FAILURE);
3264
3265                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3266                                 log_error("PR_SET_PDEATHSIG failed: %m");
3267                                 _exit(EXIT_FAILURE);
3268                         }
3269
3270                         /* Mark everything as slave, so that we still
3271                          * receive mounts from the real root, but don't
3272                          * propagate mounts to the real root. */
3273                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3274                                 log_error("MS_SLAVE|MS_REC failed: %m");
3275                                 _exit(EXIT_FAILURE);
3276                         }
3277
3278                         if (mount_devices(arg_directory,
3279                                           root_device, root_device_rw,
3280                                           home_device, home_device_rw,
3281                                           srv_device, srv_device_rw) < 0)
3282                                 _exit(EXIT_FAILURE);
3283
3284                         /* Turn directory into bind mount */
3285                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3286                                 log_error("Failed to make bind mount: %m");
3287                                 _exit(EXIT_FAILURE);
3288                         }
3289
3290                         r = setup_volatile(arg_directory);
3291                         if (r < 0)
3292                                 _exit(EXIT_FAILURE);
3293
3294                         if (setup_volatile_state(arg_directory) < 0)
3295                                 _exit(EXIT_FAILURE);
3296
3297                         r = base_filesystem_create(arg_directory);
3298                         if (r < 0)
3299                                 _exit(EXIT_FAILURE);
3300
3301                         if (arg_read_only) {
3302                                 k = bind_remount_recursive(arg_directory, true);
3303                                 if (k < 0) {
3304                                         log_error("Failed to make tree read-only: %s", strerror(-k));
3305                                         _exit(EXIT_FAILURE);
3306                                 }
3307                         }
3308
3309                         if (mount_all(arg_directory) < 0)
3310                                 _exit(EXIT_FAILURE);
3311
3312                         if (copy_devnodes(arg_directory) < 0)
3313                                 _exit(EXIT_FAILURE);
3314
3315                         if (setup_ptmx(arg_directory) < 0)
3316                                 _exit(EXIT_FAILURE);
3317
3318                         dev_setup(arg_directory);
3319
3320                         if (setup_seccomp() < 0)
3321                                 _exit(EXIT_FAILURE);
3322
3323                         if (setup_dev_console(arg_directory, console) < 0)
3324                                 _exit(EXIT_FAILURE);
3325
3326                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3327                                 _exit(EXIT_FAILURE);
3328
3329                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3330
3331                         if (setup_boot_id(arg_directory) < 0)
3332                                 _exit(EXIT_FAILURE);
3333
3334                         if (setup_timezone(arg_directory) < 0)
3335                                 _exit(EXIT_FAILURE);
3336
3337                         if (setup_resolv_conf(arg_directory) < 0)
3338                                 _exit(EXIT_FAILURE);
3339
3340                         if (setup_journal(arg_directory) < 0)
3341                                 _exit(EXIT_FAILURE);
3342
3343                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3344                                 _exit(EXIT_FAILURE);
3345
3346                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3347                                 _exit(EXIT_FAILURE);
3348
3349                         if (mount_tmpfs(arg_directory) < 0)
3350                                 _exit(EXIT_FAILURE);
3351
3352                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3353                                 _exit(EXIT_FAILURE);
3354
3355                         /* Tell the parent that we are ready, and that
3356                          * it can cgroupify us so that we lack access
3357                          * to certain devices and resources. */
3358                         barrier_place(&barrier);
3359
3360                         if (chdir(arg_directory) < 0) {
3361                                 log_error("chdir(%s) failed: %m", arg_directory);
3362                                 _exit(EXIT_FAILURE);
3363                         }
3364
3365                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3366                                 log_error("mount(MS_MOVE) failed: %m");
3367                                 _exit(EXIT_FAILURE);
3368                         }
3369
3370                         if (chroot(".") < 0) {
3371                                 log_error("chroot() failed: %m");
3372                                 _exit(EXIT_FAILURE);
3373                         }
3374
3375                         if (chdir("/") < 0) {
3376                                 log_error("chdir() failed: %m");
3377                                 _exit(EXIT_FAILURE);
3378                         }
3379
3380                         umask(0022);
3381
3382                         if (arg_private_network)
3383                                 loopback_setup();
3384
3385                         if (drop_capabilities() < 0) {
3386                                 log_error("drop_capabilities() failed: %m");
3387                                 _exit(EXIT_FAILURE);
3388                         }
3389
3390                         r = change_uid_gid(&home);
3391                         if (r < 0)
3392                                 _exit(EXIT_FAILURE);
3393
3394                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3395                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3396                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3397                                 log_oom();
3398                                 _exit(EXIT_FAILURE);
3399                         }
3400
3401                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3402                                 char as_uuid[37];
3403
3404                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3405                                         log_oom();
3406                                         _exit(EXIT_FAILURE);
3407                                 }
3408                         }
3409
3410                         if (fdset_size(fds) > 0) {
3411                                 k = fdset_cloexec(fds, false);
3412                                 if (k < 0) {
3413                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3414                                         _exit(EXIT_FAILURE);
3415                                 }
3416
3417                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3418                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3419                                         log_oom();
3420                                         _exit(EXIT_FAILURE);
3421                                 }
3422                         }
3423
3424                         setup_hostname();
3425
3426                         if (arg_personality != 0xffffffffLU) {
3427                                 if (personality(arg_personality) < 0) {
3428                                         log_error("personality() failed: %m");
3429                                         _exit(EXIT_FAILURE);
3430                                 }
3431                         } else if (secondary) {
3432                                 if (personality(PER_LINUX32) < 0) {
3433                                         log_error("personality() failed: %m");
3434                                         _exit(EXIT_FAILURE);
3435                                 }
3436                         }
3437
3438 #ifdef HAVE_SELINUX
3439                         if (arg_selinux_context)
3440                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3441                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3442                                         _exit(EXIT_FAILURE);
3443                                 }
3444 #endif
3445
3446                         if (!strv_isempty(arg_setenv)) {
3447                                 char **n;
3448
3449                                 n = strv_env_merge(2, envp, arg_setenv);
3450                                 if (!n) {
3451                                         log_oom();
3452                                         _exit(EXIT_FAILURE);
3453                                 }
3454
3455                                 env_use = n;
3456                         } else
3457                                 env_use = (char**) envp;
3458
3459                         /* Wait until the parent is ready with the setup, too... */
3460                         if (!barrier_place_and_sync(&barrier))
3461                                 _exit(EXIT_FAILURE);
3462
3463                         if (arg_boot) {
3464                                 char **a;
3465                                 size_t l;
3466
3467                                 /* Automatically search for the init system */
3468
3469                                 l = 1 + argc - optind;
3470                                 a = newa(char*, l + 1);
3471                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3472
3473                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3474                                 execve(a[0], a, env_use);
3475
3476                                 a[0] = (char*) "/lib/systemd/systemd";
3477                                 execve(a[0], a, env_use);
3478
3479                                 a[0] = (char*) "/sbin/init";
3480                                 execve(a[0], a, env_use);
3481                         } else if (argc > optind)
3482                                 execvpe(argv[optind], argv + optind, env_use);
3483                         else {
3484                                 chdir(home ? home : "/root");
3485                                 execle("/bin/bash", "-bash", NULL, env_use);
3486                                 execle("/bin/sh", "-sh", NULL, env_use);
3487                         }
3488
3489                         log_error("execv() failed: %m");
3490                         _exit(EXIT_FAILURE);
3491                 }
3492
3493                 barrier_set_role(&barrier, BARRIER_PARENT);
3494                 fdset_free(fds);
3495                 fds = NULL;
3496
3497                 /* wait for child-setup to be done */
3498                 if (barrier_place_and_sync(&barrier)) {
3499                         int ifi = 0;
3500
3501                         r = move_network_interfaces(pid);
3502                         if (r < 0)
3503                                 goto finish;
3504
3505                         r = setup_veth(pid, veth_name, &ifi);
3506                         if (r < 0)
3507                                 goto finish;
3508
3509                         r = setup_bridge(veth_name, &ifi);
3510                         if (r < 0)
3511                                 goto finish;
3512
3513                         r = setup_macvlan(pid);
3514                         if (r < 0)
3515                                 goto finish;
3516
3517                         r = register_machine(pid, ifi);
3518                         if (r < 0)
3519                                 goto finish;
3520
3521                         /* Block SIGCHLD here, before notifying child.
3522                          * process_pty() will handle it with the other signals. */
3523                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3524                         if (r < 0)
3525                                 goto finish;
3526
3527                         /* Reset signal to default */
3528                         r = default_signals(SIGCHLD, -1);
3529                         if (r < 0)
3530                                 goto finish;
3531
3532                         /* Notify the child that the parent is ready with all
3533                          * its setup, and that the child can now hand over
3534                          * control to the code to run inside the container. */
3535                         barrier_place(&barrier);
3536
3537                         k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3538                         if (k < 0) {
3539                                 r = EXIT_FAILURE;
3540                                 break;
3541                         }
3542
3543                         if (!arg_quiet)
3544                                 putc('\n', stdout);
3545
3546                         /* Kill if it is not dead yet anyway */
3547                         terminate_machine(pid);
3548                 }
3549
3550                 /* Normally redundant, but better safe than sorry */
3551                 kill(pid, SIGKILL);
3552
3553                 r = wait_for_container(pid, &container_status);
3554                 pid = 0;
3555
3556                 if (r < 0) {
3557                         /* We failed to wait for the container, or the
3558                          * container exited abnormally */
3559                         r = EXIT_FAILURE;
3560                         break;
3561                 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3562                         /* The container exited with a non-zero
3563                          * status, or with zero status and no reboot
3564                          * was requested. */
3565                         break;
3566
3567                 /* CONTAINER_REBOOTED, loop again */
3568
3569                 if (arg_keep_unit) {
3570                         /* Special handling if we are running as a
3571                          * service: instead of simply restarting the
3572                          * machine we want to restart the entire
3573                          * service, so let's inform systemd about this
3574                          * with the special exit code 133. The service
3575                          * file uses RestartForceExitStatus=133 so
3576                          * that this results in a full nspawn
3577                          * restart. This is necessary since we might
3578                          * have cgroup parameters set we want to have
3579                          * flushed out. */
3580                         r = 133;
3581                         break;
3582                 }
3583         }
3584
3585 finish:
3586         sd_notify(false,
3587                   "STOPPING=1\n"
3588                   "STATUS=Terminating...");
3589
3590         loop_remove(loop_nr, &image_fd);
3591
3592         if (pid > 0)
3593                 kill(pid, SIGKILL);
3594
3595         free(arg_directory);
3596         free(arg_machine);
3597         free(arg_user);
3598         strv_free(arg_setenv);
3599         strv_free(arg_network_interfaces);
3600         strv_free(arg_network_macvlan);
3601         strv_free(arg_bind);
3602         strv_free(arg_bind_ro);
3603         strv_free(arg_tmpfs);
3604
3605         return r;
3606 }