chiark / gitweb /
nspawn: actually allow access to /dev/net/tun in the container
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92
93 #ifdef HAVE_SECCOMP
94 #include "seccomp-util.h"
95 #endif
96
/* The two outcomes of a container run that this file tells apart. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
101
/* Journal linking modes; these correspond to the "no", "auto", "host" and
 * "guest" values accepted by --link-journal= in parse_argv(). */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
108
/* Values of --volatile: a boolean (NO/YES) or the literal "state". */
typedef enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2,
} Volatile;
114
115 static char *arg_directory = NULL;
116 static char *arg_user = NULL;
117 static sd_id128_t arg_uuid = {};
118 static char *arg_machine = NULL;
119 static const char *arg_selinux_context = NULL;
120 static const char *arg_selinux_apifs_context = NULL;
121 static const char *arg_slice = NULL;
122 static bool arg_private_network = false;
123 static bool arg_read_only = false;
124 static bool arg_boot = false;
125 static LinkJournal arg_link_journal = LINK_AUTO;
126 static uint64_t arg_retain =
127         (1ULL << CAP_CHOWN) |
128         (1ULL << CAP_DAC_OVERRIDE) |
129         (1ULL << CAP_DAC_READ_SEARCH) |
130         (1ULL << CAP_FOWNER) |
131         (1ULL << CAP_FSETID) |
132         (1ULL << CAP_IPC_OWNER) |
133         (1ULL << CAP_KILL) |
134         (1ULL << CAP_LEASE) |
135         (1ULL << CAP_LINUX_IMMUTABLE) |
136         (1ULL << CAP_NET_BIND_SERVICE) |
137         (1ULL << CAP_NET_BROADCAST) |
138         (1ULL << CAP_NET_RAW) |
139         (1ULL << CAP_SETGID) |
140         (1ULL << CAP_SETFCAP) |
141         (1ULL << CAP_SETPCAP) |
142         (1ULL << CAP_SETUID) |
143         (1ULL << CAP_SYS_ADMIN) |
144         (1ULL << CAP_SYS_CHROOT) |
145         (1ULL << CAP_SYS_NICE) |
146         (1ULL << CAP_SYS_PTRACE) |
147         (1ULL << CAP_SYS_TTY_CONFIG) |
148         (1ULL << CAP_SYS_RESOURCE) |
149         (1ULL << CAP_SYS_BOOT) |
150         (1ULL << CAP_AUDIT_WRITE) |
151         (1ULL << CAP_AUDIT_CONTROL) |
152         (1ULL << CAP_MKNOD);
153 static char **arg_bind = NULL;
154 static char **arg_bind_ro = NULL;
155 static char **arg_tmpfs = NULL;
156 static char **arg_setenv = NULL;
157 static bool arg_quiet = false;
158 static bool arg_share_system = false;
159 static bool arg_register = true;
160 static bool arg_keep_unit = false;
161 static char **arg_network_interfaces = NULL;
162 static char **arg_network_macvlan = NULL;
163 static bool arg_network_veth = false;
164 static const char *arg_network_bridge = NULL;
165 static unsigned long arg_personality = 0xffffffffLU;
166 static const char *arg_image = NULL;
167 static Volatile arg_volatile = VOLATILE_NO;
168
169 static void help(void) {
170         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
171                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
172                "  -h --help                 Show this help\n"
173                "     --version              Print version string\n"
174                "  -q --quiet                Do not show status information\n"
175                "  -D --directory=PATH       Root directory for the container\n"
176                "  -i --image=PATH           File system device or image for the container\n"
177                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
178                "  -u --user=USER            Run the command under specified user or uid\n"
179                "  -M --machine=NAME         Set the machine name for the container\n"
180                "     --uuid=UUID            Set a specific machine UUID for the container\n"
181                "  -S --slice=SLICE          Place the container in the specified slice\n"
182                "     --private-network      Disable network in container\n"
183                "     --network-interface=INTERFACE\n"
184                "                            Assign an existing network interface to the\n"
185                "                            container\n"
186                "     --network-macvlan=INTERFACE\n"
187                "                            Create a macvlan network interface based on an\n"
188                "                            existing network interface to the container\n"
189                "     --network-veth         Add a virtual ethernet connection between host\n"
190                "                            and container\n"
191                "     --network-bridge=INTERFACE\n"
192                "                            Add a virtual ethernet connection between host\n"
193                "                            and container and add it to an existing bridge on\n"
194                "                            the host\n"
195                "  -Z --selinux-context=SECLABEL\n"
196                "                            Set the SELinux security context to be used by\n"
197                "                            processes in the container\n"
198                "  -L --selinux-apifs-context=SECLABEL\n"
199                "                            Set the SELinux security context to be used by\n"
200                "                            API/tmpfs file systems in the container\n"
201                "     --capability=CAP       In addition to the default, retain specified\n"
202                "                            capability\n"
203                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
204                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
205                "  -j                        Equivalent to --link-journal=host\n"
206                "     --read-only            Mount the root directory read-only\n"
207                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
208                "                            the container\n"
209                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
210                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
211                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
212                "     --share-system         Share system namespaces with host\n"
213                "     --register=BOOLEAN     Register container as machine\n"
214                "     --keep-unit            Do not register a scope for the machine, reuse\n"
215                "                            the service unit nspawn is running in\n"
216                "     --volatile[=MODE]      Run the system in volatile mode\n",
217                program_invocation_short_name);
218 }
219
220 static int parse_argv(int argc, char *argv[]) {
221
222         enum {
223                 ARG_VERSION = 0x100,
224                 ARG_PRIVATE_NETWORK,
225                 ARG_UUID,
226                 ARG_READ_ONLY,
227                 ARG_CAPABILITY,
228                 ARG_DROP_CAPABILITY,
229                 ARG_LINK_JOURNAL,
230                 ARG_BIND,
231                 ARG_BIND_RO,
232                 ARG_TMPFS,
233                 ARG_SETENV,
234                 ARG_SHARE_SYSTEM,
235                 ARG_REGISTER,
236                 ARG_KEEP_UNIT,
237                 ARG_NETWORK_INTERFACE,
238                 ARG_NETWORK_MACVLAN,
239                 ARG_NETWORK_VETH,
240                 ARG_NETWORK_BRIDGE,
241                 ARG_PERSONALITY,
242                 ARG_VOLATILE,
243         };
244
245         static const struct option options[] = {
246                 { "help",                  no_argument,       NULL, 'h'                   },
247                 { "version",               no_argument,       NULL, ARG_VERSION           },
248                 { "directory",             required_argument, NULL, 'D'                   },
249                 { "user",                  required_argument, NULL, 'u'                   },
250                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
251                 { "boot",                  no_argument,       NULL, 'b'                   },
252                 { "uuid",                  required_argument, NULL, ARG_UUID              },
253                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
254                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
255                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
256                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
257                 { "bind",                  required_argument, NULL, ARG_BIND              },
258                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
259                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
260                 { "machine",               required_argument, NULL, 'M'                   },
261                 { "slice",                 required_argument, NULL, 'S'                   },
262                 { "setenv",                required_argument, NULL, ARG_SETENV            },
263                 { "selinux-context",       required_argument, NULL, 'Z'                   },
264                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
265                 { "quiet",                 no_argument,       NULL, 'q'                   },
266                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
267                 { "register",              required_argument, NULL, ARG_REGISTER          },
268                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
269                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
270                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
271                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
272                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
273                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
274                 { "image",                 required_argument, NULL, 'i'                   },
275                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
276                 {}
277         };
278
279         int c, r;
280         uint64_t plus = 0, minus = 0;
281
282         assert(argc >= 0);
283         assert(argv);
284
285         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
286
287                 switch (c) {
288
289                 case 'h':
290                         help();
291                         return 0;
292
293                 case ARG_VERSION:
294                         puts(PACKAGE_STRING);
295                         puts(SYSTEMD_FEATURES);
296                         return 0;
297
298                 case 'D':
299                         free(arg_directory);
300                         arg_directory = canonicalize_file_name(optarg);
301                         if (!arg_directory) {
302                                 log_error("Invalid root directory: %m");
303                                 return -ENOMEM;
304                         }
305
306                         break;
307
308                 case 'i':
309                         arg_image = optarg;
310                         break;
311
312                 case 'u':
313                         free(arg_user);
314                         arg_user = strdup(optarg);
315                         if (!arg_user)
316                                 return log_oom();
317
318                         break;
319
320                 case ARG_NETWORK_BRIDGE:
321                         arg_network_bridge = optarg;
322
323                         /* fall through */
324
325                 case ARG_NETWORK_VETH:
326                         arg_network_veth = true;
327                         arg_private_network = true;
328                         break;
329
330                 case ARG_NETWORK_INTERFACE:
331                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
332                                 return log_oom();
333
334                         arg_private_network = true;
335                         break;
336
337                 case ARG_NETWORK_MACVLAN:
338                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
339                                 return log_oom();
340
341                         /* fall through */
342
343                 case ARG_PRIVATE_NETWORK:
344                         arg_private_network = true;
345                         break;
346
347                 case 'b':
348                         arg_boot = true;
349                         break;
350
351                 case ARG_UUID:
352                         r = sd_id128_from_string(optarg, &arg_uuid);
353                         if (r < 0) {
354                                 log_error("Invalid UUID: %s", optarg);
355                                 return r;
356                         }
357                         break;
358
359                 case 'S':
360                         arg_slice = optarg;
361                         break;
362
363                 case 'M':
364                         if (isempty(optarg)) {
365                                 free(arg_machine);
366                                 arg_machine = NULL;
367                         } else {
368
369                                 if (!hostname_is_valid(optarg)) {
370                                         log_error("Invalid machine name: %s", optarg);
371                                         return -EINVAL;
372                                 }
373
374                                 free(arg_machine);
375                                 arg_machine = strdup(optarg);
376                                 if (!arg_machine)
377                                         return log_oom();
378
379                                 break;
380                         }
381
382                 case 'Z':
383                         arg_selinux_context = optarg;
384                         break;
385
386                 case 'L':
387                         arg_selinux_apifs_context = optarg;
388                         break;
389
390                 case ARG_READ_ONLY:
391                         arg_read_only = true;
392                         break;
393
394                 case ARG_CAPABILITY:
395                 case ARG_DROP_CAPABILITY: {
396                         const char *state, *word;
397                         size_t length;
398
399                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
400                                 _cleanup_free_ char *t;
401                                 cap_value_t cap;
402
403                                 t = strndup(word, length);
404                                 if (!t)
405                                         return log_oom();
406
407                                 if (streq(t, "all")) {
408                                         if (c == ARG_CAPABILITY)
409                                                 plus = (uint64_t) -1;
410                                         else
411                                                 minus = (uint64_t) -1;
412                                 } else {
413                                         if (cap_from_name(t, &cap) < 0) {
414                                                 log_error("Failed to parse capability %s.", t);
415                                                 return -EINVAL;
416                                         }
417
418                                         if (c == ARG_CAPABILITY)
419                                                 plus |= 1ULL << (uint64_t) cap;
420                                         else
421                                                 minus |= 1ULL << (uint64_t) cap;
422                                 }
423                         }
424
425                         break;
426                 }
427
428                 case 'j':
429                         arg_link_journal = LINK_GUEST;
430                         break;
431
432                 case ARG_LINK_JOURNAL:
433                         if (streq(optarg, "auto"))
434                                 arg_link_journal = LINK_AUTO;
435                         else if (streq(optarg, "no"))
436                                 arg_link_journal = LINK_NO;
437                         else if (streq(optarg, "guest"))
438                                 arg_link_journal = LINK_GUEST;
439                         else if (streq(optarg, "host"))
440                                 arg_link_journal = LINK_HOST;
441                         else {
442                                 log_error("Failed to parse link journal mode %s", optarg);
443                                 return -EINVAL;
444                         }
445
446                         break;
447
448                 case ARG_BIND:
449                 case ARG_BIND_RO: {
450                         _cleanup_free_ char *a = NULL, *b = NULL;
451                         char *e;
452                         char ***x;
453
454                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
455
456                         e = strchr(optarg, ':');
457                         if (e) {
458                                 a = strndup(optarg, e - optarg);
459                                 b = strdup(e + 1);
460                         } else {
461                                 a = strdup(optarg);
462                                 b = strdup(optarg);
463                         }
464
465                         if (!a || !b)
466                                 return log_oom();
467
468                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
469                                 log_error("Invalid bind mount specification: %s", optarg);
470                                 return -EINVAL;
471                         }
472
473                         r = strv_extend(x, a);
474                         if (r < 0)
475                                 return log_oom();
476
477                         r = strv_extend(x, b);
478                         if (r < 0)
479                                 return log_oom();
480
481                         break;
482                 }
483
484                 case ARG_TMPFS: {
485                         _cleanup_free_ char *a = NULL, *b = NULL;
486                         char *e;
487
488                         e = strchr(optarg, ':');
489                         if (e) {
490                                 a = strndup(optarg, e - optarg);
491                                 b = strdup(e + 1);
492                         } else {
493                                 a = strdup(optarg);
494                                 b = strdup("mode=0755");
495                         }
496
497                         if (!a || !b)
498                                 return log_oom();
499
500                         if (!path_is_absolute(a)) {
501                                 log_error("Invalid tmpfs specification: %s", optarg);
502                                 return -EINVAL;
503                         }
504
505                         r = strv_push(&arg_tmpfs, a);
506                         if (r < 0)
507                                 return log_oom();
508
509                         a = NULL;
510
511                         r = strv_push(&arg_tmpfs, b);
512                         if (r < 0)
513                                 return log_oom();
514
515                         b = NULL;
516
517                         break;
518                 }
519
520                 case ARG_SETENV: {
521                         char **n;
522
523                         if (!env_assignment_is_valid(optarg)) {
524                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
525                                 return -EINVAL;
526                         }
527
528                         n = strv_env_set(arg_setenv, optarg);
529                         if (!n)
530                                 return log_oom();
531
532                         strv_free(arg_setenv);
533                         arg_setenv = n;
534                         break;
535                 }
536
537                 case 'q':
538                         arg_quiet = true;
539                         break;
540
541                 case ARG_SHARE_SYSTEM:
542                         arg_share_system = true;
543                         break;
544
545                 case ARG_REGISTER:
546                         r = parse_boolean(optarg);
547                         if (r < 0) {
548                                 log_error("Failed to parse --register= argument: %s", optarg);
549                                 return r;
550                         }
551
552                         arg_register = r;
553                         break;
554
555                 case ARG_KEEP_UNIT:
556                         arg_keep_unit = true;
557                         break;
558
559                 case ARG_PERSONALITY:
560
561                         arg_personality = personality_from_string(optarg);
562                         if (arg_personality == 0xffffffffLU) {
563                                 log_error("Unknown or unsupported personality '%s'.", optarg);
564                                 return -EINVAL;
565                         }
566
567                         break;
568
569                 case ARG_VOLATILE:
570
571                         if (!optarg)
572                                 arg_volatile = VOLATILE_YES;
573                         else {
574                                 r = parse_boolean(optarg);
575                                 if (r < 0) {
576                                         if (streq(optarg, "state"))
577                                                 arg_volatile = VOLATILE_STATE;
578                                         else {
579                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
580                                                 return r;
581                                         }
582                                 } else
583                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
584                         }
585
586                         break;
587
588                 case '?':
589                         return -EINVAL;
590
591                 default:
592                         assert_not_reached("Unhandled option");
593                 }
594
595         if (arg_share_system)
596                 arg_register = false;
597
598         if (arg_boot && arg_share_system) {
599                 log_error("--boot and --share-system may not be combined.");
600                 return -EINVAL;
601         }
602
603         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
604                 log_error("--keep-unit may not be used when invoked from a user session.");
605                 return -EINVAL;
606         }
607
608         if (arg_directory && arg_image) {
609                 log_error("--directory= and --image= may not be combined.");
610                 return -EINVAL;
611         }
612
613         if (arg_volatile != VOLATILE_NO && arg_read_only) {
614                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
615                 return -EINVAL;
616         }
617
618         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
619
620         return 1;
621 }
622
623 static int mount_all(const char *dest) {
624
625         typedef struct MountPoint {
626                 const char *what;
627                 const char *where;
628                 const char *type;
629                 const char *options;
630                 unsigned long flags;
631                 bool fatal;
632         } MountPoint;
633
634         static const MountPoint mount_table[] = {
635                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
636                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
637                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
638                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
639                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
640                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
641                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
642                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
643 #ifdef HAVE_SELINUX
644                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
645                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
646 #endif
647         };
648
649         unsigned k;
650         int r = 0;
651
652         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
653                 _cleanup_free_ char *where = NULL;
654 #ifdef HAVE_SELINUX
655                 _cleanup_free_ char *options = NULL;
656 #endif
657                 const char *o;
658                 int t;
659
660                 where = strjoin(dest, "/", mount_table[k].where, NULL);
661                 if (!where)
662                         return log_oom();
663
664                 t = path_is_mount_point(where, true);
665                 if (t < 0) {
666                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
667
668                         if (r == 0)
669                                 r = t;
670
671                         continue;
672                 }
673
674                 /* Skip this entry if it is not a remount. */
675                 if (mount_table[k].what && t > 0)
676                         continue;
677
678                 t = mkdir_p(where, 0755);
679                 if (t < 0) {
680                         if (mount_table[k].fatal) {
681                                log_error("Failed to create directory %s: %s", where, strerror(-t));
682
683                                 if (r == 0)
684                                         r = t;
685                         } else
686                                log_warning("Failed to create directory %s: %s", where, strerror(-t));
687
688                         continue;
689                 }
690
691 #ifdef HAVE_SELINUX
692                 if (arg_selinux_apifs_context &&
693                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
694                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
695                         if (!options)
696                                 return log_oom();
697
698                         o = options;
699                 } else
700 #endif
701                         o = mount_table[k].options;
702
703
704                 if (mount(mount_table[k].what,
705                           where,
706                           mount_table[k].type,
707                           mount_table[k].flags,
708                           o) < 0) {
709
710                         if (mount_table[k].fatal) {
711                                 log_error("mount(%s) failed: %m", where);
712
713                                 if (r == 0)
714                                         r = -errno;
715                         } else
716                                 log_warning("mount(%s) failed: %m", where);
717                 }
718         }
719
720         return r;
721 }
722
723 static int mount_binds(const char *dest, char **l, bool ro) {
724         char **x, **y;
725
726         STRV_FOREACH_PAIR(x, y, l) {
727                 _cleanup_free_ char *where = NULL;
728                 struct stat source_st, dest_st;
729                 int r;
730
731                 if (stat(*x, &source_st) < 0) {
732                         log_error("Failed to stat %s: %m", *x);
733                         return -errno;
734                 }
735
736                 where = strappend(dest, *y);
737                 if (!where)
738                         return log_oom();
739
740                 r = stat(where, &dest_st);
741                 if (r == 0) {
742                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
743                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
744                                 return -EINVAL;
745                         }
746                 } else if (errno == ENOENT) {
747                         r = mkdir_parents_label(where, 0755);
748                         if (r < 0) {
749                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
750                                 return r;
751                         }
752                 } else {
753                         log_error("Failed to bind mount %s: %m", *x);
754                         return -errno;
755                 }
756
757                 /* Create the mount point, but be conservative -- refuse to create block
758                  * and char devices. */
759                 if (S_ISDIR(source_st.st_mode)) {
760                         r = mkdir_label(where, 0755);
761                         if (r < 0) {
762                                 log_error("Failed to create mount point %s: %s", where, strerror(-r));
763
764                                 return r;
765                         }
766                 } else if (S_ISFIFO(source_st.st_mode)) {
767                         r = mkfifo(where, 0644);
768                         if (r < 0 && errno != EEXIST) {
769                                 log_error("Failed to create mount point %s: %m", where);
770
771                                 return -errno;
772                         }
773                 } else if (S_ISSOCK(source_st.st_mode)) {
774                         r = mknod(where, 0644 | S_IFSOCK, 0);
775                         if (r < 0 && errno != EEXIST) {
776                                 log_error("Failed to create mount point %s: %m", where);
777
778                                 return -errno;
779                         }
780                 } else if (S_ISREG(source_st.st_mode)) {
781                         r = touch(where);
782                         if (r < 0) {
783                                 log_error("Failed to create mount point %s: %s", where, strerror(-r));
784
785                                 return r;
786                         }
787                 } else {
788                         log_error("Refusing to create mountpoint for file: %s", *x);
789                         return -ENOTSUP;
790                 }
791
792                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
793                         log_error("mount(%s) failed: %m", where);
794                         return -errno;
795                 }
796
797                 if (ro) {
798                         r = bind_remount_recursive(where, true);
799                         if (r < 0) {
800                                 log_error("Read-Only bind mount failed: %s", strerror(-r));
801                                 return r;
802                         }
803                 }
804         }
805
806         return 0;
807 }
808
809 static int mount_tmpfs(const char *dest) {
810         char **i, **o;
811
812         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
813                 _cleanup_free_ char *where = NULL;
814                 int r;
815
816                 where = strappend(dest, *i);
817                 if (!where)
818                         return log_oom();
819
820                 r = mkdir_label(where, 0755);
821                 if (r < 0) {
822                         log_error("creating mount point for tmpfs %s failed: %s", where, strerror(-r));
823
824                         return r;
825                 }
826
827                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
828                         log_error("tmpfs mount to %s failed: %m", where);
829                         return -errno;
830                 }
831         }
832
833         return 0;
834 }
835
836 static int setup_timezone(const char *dest) {
837         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
838         char *z, *y;
839         int r;
840
841         assert(dest);
842
843         /* Fix the timezone, if possible */
844         r = readlink_malloc("/etc/localtime", &p);
845         if (r < 0) {
846                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
847                 return 0;
848         }
849
850         z = path_startswith(p, "../usr/share/zoneinfo/");
851         if (!z)
852                 z = path_startswith(p, "/usr/share/zoneinfo/");
853         if (!z) {
854                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
855                 return 0;
856         }
857
858         where = strappend(dest, "/etc/localtime");
859         if (!where)
860                 return log_oom();
861
862         r = readlink_malloc(where, &q);
863         if (r >= 0) {
864                 y = path_startswith(q, "../usr/share/zoneinfo/");
865                 if (!y)
866                         y = path_startswith(q, "/usr/share/zoneinfo/");
867
868                 /* Already pointing to the right place? Then do nothing .. */
869                 if (y && streq(y, z))
870                         return 0;
871         }
872
873         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
874         if (!check)
875                 return log_oom();
876
877         if (access(check, F_OK) < 0) {
878                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
879                 return 0;
880         }
881
882         what = strappend("../usr/share/zoneinfo/", z);
883         if (!what)
884                 return log_oom();
885
886         r = mkdir_parents(where, 0755);
887         if (r < 0) {
888                 log_error("Failed to create directory for timezone info %s in container: %s", where, strerror(-r));
889
890                 return 0;
891         }
892
893         r = unlink(where);
894         if (r < 0 && errno != ENOENT) {
895                 log_error("Failed to remove existing timezone info %s in container: %m", where);
896
897                 return 0;
898         }
899
900         if (symlink(what, where) < 0) {
901                 log_error("Failed to correct timezone of container: %m");
902                 return 0;
903         }
904
905         return 0;
906 }
907
908 static int setup_resolv_conf(const char *dest) {
909         _cleanup_free_ char *where = NULL;
910         int r;
911
912         assert(dest);
913
914         if (arg_private_network)
915                 return 0;
916
917         /* Fix resolv.conf, if possible */
918         where = strappend(dest, "/etc/resolv.conf");
919         if (!where)
920                 return log_oom();
921
922         /* We don't really care for the results of this really. If it
923          * fails, it fails, but meh... */
924         r = mkdir_parents(where, 0755);
925         if (r < 0) {
926                 log_warning("Failed to create parent directory for resolv.conf %s: %s", where, strerror(-r));
927
928                 return 0;
929         }
930
931         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
932         if (r < 0) {
933                 log_warning("Failed to copy /etc/resolv.conf to %s: %s", where, strerror(-r));
934
935                 return 0;
936         }
937
938         return 0;
939 }
940
941 static int setup_volatile_state(const char *directory) {
942         const char *p;
943         int r;
944
945         assert(directory);
946
947         if (arg_volatile != VOLATILE_STATE)
948                 return 0;
949
950         /* --volatile=state means we simply overmount /var
951            with a tmpfs, and the rest read-only. */
952
953         r = bind_remount_recursive(directory, true);
954         if (r < 0) {
955                 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
956                 return r;
957         }
958
959         p = strappenda(directory, "/var");
960         r = mkdir(p, 0755);
961         if (r < 0 && errno != EEXIST) {
962                 log_error("Failed to create %s: %m", directory);
963                 return -errno;
964         }
965
966         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
967                 log_error("Failed to mount tmpfs to /var: %m");
968                 return -errno;
969         }
970
971         return 0;
972 }
973
974 static int setup_volatile(const char *directory) {
975         bool tmpfs_mounted = false, bind_mounted = false;
976         char template[] = "/tmp/nspawn-volatile-XXXXXX";
977         const char *f, *t;
978         int r;
979
980         assert(directory);
981
982         if (arg_volatile != VOLATILE_YES)
983                 return 0;
984
985         /* --volatile=yes means we mount a tmpfs to the root dir, and
986            the original /usr to use inside it, and that read-only. */
987
988         if (!mkdtemp(template)) {
989                 log_error("Failed to create temporary directory: %m");
990                 return -errno;
991         }
992
993         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
994                 log_error("Failed to mount tmpfs for root directory: %m");
995                 r = -errno;
996                 goto fail;
997         }
998
999         tmpfs_mounted = true;
1000
1001         f = strappenda(directory, "/usr");
1002         t = strappenda(template, "/usr");
1003
1004         r = mkdir(t, 0755);
1005         if (r < 0 && errno != EEXIST) {
1006                 log_error("Failed to create %s: %m", t);
1007                 r = -errno;
1008                 goto fail;
1009         }
1010
1011         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1012                 log_error("Failed to create /usr bind mount: %m");
1013                 r = -errno;
1014                 goto fail;
1015         }
1016
1017         bind_mounted = true;
1018
1019         r = bind_remount_recursive(t, true);
1020         if (r < 0) {
1021                 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
1022                 goto fail;
1023         }
1024
1025         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1026                 log_error("Failed to move root mount: %m");
1027                 r = -errno;
1028                 goto fail;
1029         }
1030
1031         rmdir(template);
1032
1033         return 0;
1034
1035 fail:
1036         if (bind_mounted)
1037                 umount(t);
1038         if (tmpfs_mounted)
1039                 umount(template);
1040         rmdir(template);
1041         return r;
1042 }
1043
1044 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1045
1046         snprintf(s, 37,
1047                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1048                  SD_ID128_FORMAT_VAL(id));
1049
1050         return s;
1051 }
1052
1053 static int setup_boot_id(const char *dest) {
1054         _cleanup_free_ char *from = NULL, *to = NULL;
1055         sd_id128_t rnd = {};
1056         char as_uuid[37];
1057         int r;
1058
1059         assert(dest);
1060
1061         if (arg_share_system)
1062                 return 0;
1063
1064         /* Generate a new randomized boot ID, so that each boot-up of
1065          * the container gets a new one */
1066
1067         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1068         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1069         if (!from || !to)
1070                 return log_oom();
1071
1072         r = sd_id128_randomize(&rnd);
1073         if (r < 0) {
1074                 log_error("Failed to generate random boot id: %s", strerror(-r));
1075                 return r;
1076         }
1077
1078         id128_format_as_uuid(rnd, as_uuid);
1079
1080         r = write_string_file(from, as_uuid);
1081         if (r < 0) {
1082                 log_error("Failed to write boot id: %s", strerror(-r));
1083                 return r;
1084         }
1085
1086         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1087                 log_error("Failed to bind mount boot id: %m");
1088                 r = -errno;
1089         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1090                 log_warning("Failed to make boot id read-only: %m");
1091
1092         unlink(from);
1093         return r;
1094 }
1095
1096 static int copy_devnodes(const char *dest) {
1097
1098         static const char devnodes[] =
1099                 "null\0"
1100                 "zero\0"
1101                 "full\0"
1102                 "random\0"
1103                 "urandom\0"
1104                 "tty\0"
1105                 "net/tun\0";
1106
1107         const char *d;
1108         int r = 0;
1109         _cleanup_umask_ mode_t u;
1110
1111         assert(dest);
1112
1113         u = umask(0000);
1114
1115         NULSTR_FOREACH(d, devnodes) {
1116                 _cleanup_free_ char *from = NULL, *to = NULL;
1117                 struct stat st;
1118
1119                 from = strappend("/dev/", d);
1120                 to = strjoin(dest, "/dev/", d, NULL);
1121                 if (!from || !to)
1122                         return log_oom();
1123
1124                 if (stat(from, &st) < 0) {
1125
1126                         if (errno != ENOENT) {
1127                                 log_error("Failed to stat %s: %m", from);
1128                                 return -errno;
1129                         }
1130
1131                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1132
1133                         log_error("%s is not a char or block device, cannot copy", from);
1134                         return -EIO;
1135
1136                 } else {
1137                         r = mkdir_parents(to, 0775);
1138                         if (r < 0) {
1139                                 log_error("Failed to create parent directory of %s: %s", to, strerror(-r));
1140                                 return -r;
1141                         }
1142
1143                         if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1144                                 log_error("mknod(%s) failed: %m", dest);
1145                                 return  -errno;
1146                         }
1147                 }
1148         }
1149
1150         return r;
1151 }
1152
1153 static int setup_ptmx(const char *dest) {
1154         _cleanup_free_ char *p = NULL;
1155
1156         p = strappend(dest, "/dev/ptmx");
1157         if (!p)
1158                 return log_oom();
1159
1160         if (symlink("pts/ptmx", p) < 0) {
1161                 log_error("Failed to create /dev/ptmx symlink: %m");
1162                 return -errno;
1163         }
1164
1165         return 0;
1166 }
1167
1168 static int setup_dev_console(const char *dest, const char *console) {
1169         _cleanup_umask_ mode_t u;
1170         const char *to;
1171         struct stat st;
1172         int r;
1173
1174         assert(dest);
1175         assert(console);
1176
1177         u = umask(0000);
1178
1179         if (stat("/dev/null", &st) < 0) {
1180                 log_error("Failed to stat /dev/null: %m");
1181                 return -errno;
1182         }
1183
1184         r = chmod_and_chown(console, 0600, 0, 0);
1185         if (r < 0) {
1186                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
1187                 return r;
1188         }
1189
1190         /* We need to bind mount the right tty to /dev/console since
1191          * ptys can only exist on pts file systems. To have something
1192          * to bind mount things on we create a device node first, and
1193          * use /dev/null for that since we the cgroups device policy
1194          * allows us to create that freely, while we cannot create
1195          * /dev/console. (Note that the major minor doesn't actually
1196          * matter here, since we mount it over anyway). */
1197
1198         to = strappenda(dest, "/dev/console");
1199         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1200                 log_error("mknod() for /dev/console failed: %m");
1201                 return -errno;
1202         }
1203
1204         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1205                 log_error("Bind mount for /dev/console failed: %m");
1206                 return -errno;
1207         }
1208
1209         return 0;
1210 }
1211
1212 static int setup_kmsg(const char *dest, int kmsg_socket) {
1213         _cleanup_free_ char *from = NULL, *to = NULL;
1214         int r, fd, k;
1215         _cleanup_umask_ mode_t u;
1216         union {
1217                 struct cmsghdr cmsghdr;
1218                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1219         } control = {};
1220         struct msghdr mh = {
1221                 .msg_control = &control,
1222                 .msg_controllen = sizeof(control),
1223         };
1224         struct cmsghdr *cmsg;
1225
1226         assert(dest);
1227         assert(kmsg_socket >= 0);
1228
1229         u = umask(0000);
1230
1231         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1232          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1233          * on the reading side behave very similar to /proc/kmsg,
1234          * their writing side behaves differently from /dev/kmsg in
1235          * that writing blocks when nothing is reading. In order to
1236          * avoid any problems with containers deadlocking due to this
1237          * we simply make /dev/kmsg unavailable to the container. */
1238         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1239             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1240                 return log_oom();
1241
1242         if (mkfifo(from, 0600) < 0) {
1243                 log_error("mkfifo() for /dev/kmsg failed: %m");
1244                 return -errno;
1245         }
1246
1247         r = chmod_and_chown(from, 0600, 0, 0);
1248         if (r < 0) {
1249                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1250                 return r;
1251         }
1252
1253         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1254                 log_error("Bind mount for /proc/kmsg failed: %m");
1255                 return -errno;
1256         }
1257
1258         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1259         if (fd < 0) {
1260                 log_error("Failed to open fifo: %m");
1261                 return -errno;
1262         }
1263
1264         cmsg = CMSG_FIRSTHDR(&mh);
1265         cmsg->cmsg_level = SOL_SOCKET;
1266         cmsg->cmsg_type = SCM_RIGHTS;
1267         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1268         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1269
1270         mh.msg_controllen = cmsg->cmsg_len;
1271
1272         /* Store away the fd in the socket, so that it stays open as
1273          * long as we run the child */
1274         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1275         safe_close(fd);
1276
1277         if (k < 0) {
1278                 log_error("Failed to send FIFO fd: %m");
1279                 return -errno;
1280         }
1281
1282         /* And now make the FIFO unavailable as /dev/kmsg... */
1283         unlink(from);
1284         return 0;
1285 }
1286
1287 static int setup_hostname(void) {
1288
1289         if (arg_share_system)
1290                 return 0;
1291
1292         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1293                 return -errno;
1294
1295         return 0;
1296 }
1297
1298 static int setup_journal(const char *directory) {
1299         sd_id128_t machine_id, this_id;
1300         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1301         char *id;
1302         int r;
1303
1304         p = strappend(directory, "/etc/machine-id");
1305         if (!p)
1306                 return log_oom();
1307
1308         r = read_one_line_file(p, &b);
1309         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1310                 return 0;
1311         else if (r < 0) {
1312                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1313                 return r;
1314         }
1315
1316         id = strstrip(b);
1317         if (isempty(id) && arg_link_journal == LINK_AUTO)
1318                 return 0;
1319
1320         /* Verify validity */
1321         r = sd_id128_from_string(id, &machine_id);
1322         if (r < 0) {
1323                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1324                 return r;
1325         }
1326
1327         r = sd_id128_get_machine(&this_id);
1328         if (r < 0) {
1329                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1330                 return r;
1331         }
1332
1333         if (sd_id128_equal(machine_id, this_id)) {
1334                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1335                          "Host and machine ids are equal (%s): refusing to link journals", id);
1336                 if (arg_link_journal == LINK_AUTO)
1337                         return 0;
1338                 return
1339                         -EEXIST;
1340         }
1341
1342         if (arg_link_journal == LINK_NO)
1343                 return 0;
1344
1345         free(p);
1346         p = strappend("/var/log/journal/", id);
1347         q = strjoin(directory, "/var/log/journal/", id, NULL);
1348         if (!p || !q)
1349                 return log_oom();
1350
1351         if (path_is_mount_point(p, false) > 0) {
1352                 if (arg_link_journal != LINK_AUTO) {
1353                         log_error("%s: already a mount point, refusing to use for journal", p);
1354                         return -EEXIST;
1355                 }
1356
1357                 return 0;
1358         }
1359
1360         if (path_is_mount_point(q, false) > 0) {
1361                 if (arg_link_journal != LINK_AUTO) {
1362                         log_error("%s: already a mount point, refusing to use for journal", q);
1363                         return -EEXIST;
1364                 }
1365
1366                 return 0;
1367         }
1368
1369         r = readlink_and_make_absolute(p, &d);
1370         if (r >= 0) {
1371                 if ((arg_link_journal == LINK_GUEST ||
1372                      arg_link_journal == LINK_AUTO) &&
1373                     path_equal(d, q)) {
1374
1375                         r = mkdir_p(q, 0755);
1376                         if (r < 0)
1377                                 log_warning("Failed to create directory %s: %m", q);
1378                         return 0;
1379                 }
1380
1381                 if (unlink(p) < 0) {
1382                         log_error("Failed to remove symlink %s: %m", p);
1383                         return -errno;
1384                 }
1385         } else if (r == -EINVAL) {
1386
1387                 if (arg_link_journal == LINK_GUEST &&
1388                     rmdir(p) < 0) {
1389
1390                         if (errno == ENOTDIR) {
1391                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1392                                 return r;
1393                         } else {
1394                                 log_error("Failed to remove %s: %m", p);
1395                                 return -errno;
1396                         }
1397                 }
1398         } else if (r != -ENOENT) {
1399                 log_error("readlink(%s) failed: %m", p);
1400                 return r;
1401         }
1402
1403         if (arg_link_journal == LINK_GUEST) {
1404
1405                 if (symlink(q, p) < 0) {
1406                         log_error("Failed to symlink %s to %s: %m", q, p);
1407                         return -errno;
1408                 }
1409
1410                 r = mkdir_p(q, 0755);
1411                 if (r < 0)
1412                         log_warning("Failed to create directory %s: %m", q);
1413                 return 0;
1414         }
1415
1416         if (arg_link_journal == LINK_HOST) {
1417                 r = mkdir_p(p, 0755);
1418                 if (r < 0) {
1419                         log_error("Failed to create %s: %m", p);
1420                         return r;
1421                 }
1422
1423         } else if (access(p, F_OK) < 0)
1424                 return 0;
1425
1426         if (dir_is_empty(q) == 0)
1427                 log_warning("%s is not empty, proceeding anyway.", q);
1428
1429         r = mkdir_p(q, 0755);
1430         if (r < 0) {
1431                 log_error("Failed to create %s: %m", q);
1432                 return r;
1433         }
1434
1435         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1436                 log_error("Failed to bind mount journal from host into guest: %m");
1437                 return -errno;
1438         }
1439
1440         return 0;
1441 }
1442
/* Bind mounts path onto a newly created <dest>/dev/kdbus directory.
 * A NULL path means there is nothing to set up. Returns 0 on success or a
 * negative errno-style value. */
static int setup_kdbus(const char *dest, const char *path) {
        const char *mount_point;

        if (path == NULL)
                return 0;

        mount_point = strappenda(dest, "/dev/kdbus");

        if (mkdir(mount_point, 0755) < 0) {
                log_error("Failed to create kdbus path: %m");
                return  -errno;
        }

        if (mount(path, mount_point, "bind", MS_BIND, NULL) < 0) {
                log_error("Failed to mount kdbus domain path: %m");
                return -errno;
        }

        return 0;
}
1462
1463 static int drop_capabilities(void) {
1464         return capability_bounding_set_drop(~arg_retain, false);
1465 }
1466
1467 static int register_machine(pid_t pid, int local_ifindex) {
1468         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1469         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1470         int r;
1471
1472         if (!arg_register)
1473                 return 0;
1474
1475         r = sd_bus_default_system(&bus);
1476         if (r < 0) {
1477                 log_error("Failed to open system bus: %s", strerror(-r));
1478                 return r;
1479         }
1480
1481         if (arg_keep_unit) {
1482                 r = sd_bus_call_method(
1483                                 bus,
1484                                 "org.freedesktop.machine1",
1485                                 "/org/freedesktop/machine1",
1486                                 "org.freedesktop.machine1.Manager",
1487                                 "RegisterMachineWithNetwork",
1488                                 &error,
1489                                 NULL,
1490                                 "sayssusai",
1491                                 arg_machine,
1492                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1493                                 "nspawn",
1494                                 "container",
1495                                 (uint32_t) pid,
1496                                 strempty(arg_directory),
1497                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1498         } else {
1499                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1500
1501                 r = sd_bus_message_new_method_call(
1502                                 bus,
1503                                 &m,
1504                                 "org.freedesktop.machine1",
1505                                 "/org/freedesktop/machine1",
1506                                 "org.freedesktop.machine1.Manager",
1507                                 "CreateMachineWithNetwork");
1508                 if (r < 0) {
1509                         log_error("Failed to create message: %s", strerror(-r));
1510                         return r;
1511                 }
1512
1513                 r = sd_bus_message_append(
1514                                 m,
1515                                 "sayssusai",
1516                                 arg_machine,
1517                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1518                                 "nspawn",
1519                                 "container",
1520                                 (uint32_t) pid,
1521                                 strempty(arg_directory),
1522                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1523                 if (r < 0) {
1524                         log_error("Failed to append message arguments: %s", strerror(-r));
1525                         return r;
1526                 }
1527
1528                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1529                 if (r < 0) {
1530                         log_error("Failed to open container: %s", strerror(-r));
1531                         return r;
1532                 }
1533
1534                 if (!isempty(arg_slice)) {
1535                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1536                         if (r < 0) {
1537                                 log_error("Failed to append slice: %s", strerror(-r));
1538                                 return r;
1539                         }
1540                 }
1541
1542                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1543                 if (r < 0) {
1544                         log_error("Failed to add device policy: %s", strerror(-r));
1545                         return r;
1546                 }
1547
1548                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1549                                           /* Allow the container to
1550                                            * access and create the API
1551                                            * device nodes, so that
1552                                            * PrivateDevices= in the
1553                                            * container can work
1554                                            * fine */
1555                                           "/dev/null", "rwm",
1556                                           "/dev/zero", "rwm",
1557                                           "/dev/full", "rwm",
1558                                           "/dev/random", "rwm",
1559                                           "/dev/urandom", "rwm",
1560                                           "/dev/tty", "rwm",
1561                                           "/dev/net/tun", "rwm",
1562                                           /* Allow the container
1563                                            * access to ptys. However,
1564                                            * do not permit the
1565                                            * container to ever create
1566                                            * these device nodes. */
1567                                           "/dev/pts/ptmx", "rw",
1568                                           "char-pts", "rw",
1569                                           /* Allow the container
1570                                            * access to all kdbus
1571                                            * devices. Again, the
1572                                            * container cannot create
1573                                            * these nodes, only use
1574                                            * them. We use a pretty
1575                                            * open match here, so that
1576                                            * the kernel API can still
1577                                            * change. */
1578                                           "char-kdbus", "rw",
1579                                           "char-kdbus/*", "rw");
1580                 if (r < 0) {
1581                         log_error("Failed to add device whitelist: %s", strerror(-r));
1582                         return r;
1583                 }
1584
1585                 r = sd_bus_message_close_container(m);
1586                 if (r < 0) {
1587                         log_error("Failed to close container: %s", strerror(-r));
1588                         return r;
1589                 }
1590
1591                 r = sd_bus_call(bus, m, 0, &error, NULL);
1592         }
1593
1594         if (r < 0) {
1595                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1596                 return r;
1597         }
1598
1599         return 0;
1600 }
1601
1602 static int terminate_machine(pid_t pid) {
1603         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1604         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1605         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1606         const char *path;
1607         int r;
1608
1609         if (!arg_register)
1610                 return 0;
1611
1612         r = sd_bus_default_system(&bus);
1613         if (r < 0) {
1614                 log_error("Failed to open system bus: %s", strerror(-r));
1615                 return r;
1616         }
1617
1618         r = sd_bus_call_method(
1619                         bus,
1620                         "org.freedesktop.machine1",
1621                         "/org/freedesktop/machine1",
1622                         "org.freedesktop.machine1.Manager",
1623                         "GetMachineByPID",
1624                         &error,
1625                         &reply,
1626                         "u",
1627                         (uint32_t) pid);
1628         if (r < 0) {
1629                 /* Note that the machine might already have been
1630                  * cleaned up automatically, hence don't consider it a
1631                  * failure if we cannot get the machine object. */
1632                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1633                 return 0;
1634         }
1635
1636         r = sd_bus_message_read(reply, "o", &path);
1637         if (r < 0)
1638                 return bus_log_parse_error(r);
1639
1640         r = sd_bus_call_method(
1641                         bus,
1642                         "org.freedesktop.machine1",
1643                         path,
1644                         "org.freedesktop.machine1.Machine",
1645                         "Terminate",
1646                         &error,
1647                         NULL,
1648                         NULL);
1649         if (r < 0) {
1650                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1651                 return 0;
1652         }
1653
1654         return 0;
1655 }
1656
1657 static int reset_audit_loginuid(void) {
1658         _cleanup_free_ char *p = NULL;
1659         int r;
1660
1661         if (arg_share_system)
1662                 return 0;
1663
1664         r = read_one_line_file("/proc/self/loginuid", &p);
1665         if (r == -ENOENT)
1666                 return 0;
1667         if (r < 0) {
1668                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1669                 return r;
1670         }
1671
1672         /* Already reset? */
1673         if (streq(p, "4294967295"))
1674                 return 0;
1675
1676         r = write_string_file("/proc/self/loginuid", "4294967295");
1677         if (r < 0) {
1678                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1679                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1680                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1681                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1682                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1683
1684                 sleep(5);
1685         }
1686
1687         return 0;
1688 }
1689
1690 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1691 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1692
1693 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
1694         int r;
1695
1696         uint8_t result[8];
1697         size_t l, sz;
1698         uint8_t *v;
1699
1700         l = strlen(arg_machine);
1701         sz = sizeof(sd_id128_t) + l;
1702         v = alloca(sz);
1703
1704         /* fetch some persistent data unique to the host */
1705         r = sd_id128_get_machine((sd_id128_t*) v);
1706         if (r < 0)
1707                 return r;
1708
1709         /* combine with some data unique (on this host) to this
1710          * container instance */
1711         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1712
1713         /* Let's hash the host machine ID plus the container name. We
1714          * use a fixed, but originally randomly created hash key here. */
1715         siphash24(result, v, sz, hash_key.bytes);
1716
1717         assert_cc(ETH_ALEN <= sizeof(result));
1718         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1719
1720         /* see eth_random_addr in the kernel */
1721         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1722         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1723
1724         return 0;
1725 }
1726
1727 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1728         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1729         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1730         struct ether_addr mac_host, mac_container;
1731         int r, i;
1732
1733         if (!arg_private_network)
1734                 return 0;
1735
1736         if (!arg_network_veth)
1737                 return 0;
1738
1739         /* Use two different interface name prefixes depending whether
1740          * we are in bridge mode or not. */
1741         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1742                  arg_network_bridge ? "vb" : "ve", arg_machine);
1743
1744         r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
1745         if (r < 0) {
1746                 log_error("Failed to generate predictable MAC address for container side");
1747                 return r;
1748         }
1749
1750         r = generate_mac(&mac_host, HOST_HASH_KEY);
1751         if (r < 0) {
1752                 log_error("Failed to generate predictable MAC address for host side");
1753                 return r;
1754         }
1755
1756         r = sd_rtnl_open(&rtnl, 0);
1757         if (r < 0) {
1758                 log_error("Failed to connect to netlink: %s", strerror(-r));
1759                 return r;
1760         }
1761
1762         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1763         if (r < 0) {
1764                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1765                 return r;
1766         }
1767
1768         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1769         if (r < 0) {
1770                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1771                 return r;
1772         }
1773
1774         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1775         if (r < 0) {
1776                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1777                 return r;
1778         }
1779
1780         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1781         if (r < 0) {
1782                 log_error("Failed to open netlink container: %s", strerror(-r));
1783                 return r;
1784         }
1785
1786         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1787         if (r < 0) {
1788                 log_error("Failed to open netlink container: %s", strerror(-r));
1789                 return r;
1790         }
1791
1792         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1793         if (r < 0) {
1794                 log_error("Failed to open netlink container: %s", strerror(-r));
1795                 return r;
1796         }
1797
1798         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1799         if (r < 0) {
1800                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1801                 return r;
1802         }
1803
1804         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1805         if (r < 0) {
1806                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1807                 return r;
1808         }
1809
1810         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1811         if (r < 0) {
1812                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1813                 return r;
1814         }
1815
1816         r = sd_rtnl_message_close_container(m);
1817         if (r < 0) {
1818                 log_error("Failed to close netlink container: %s", strerror(-r));
1819                 return r;
1820         }
1821
1822         r = sd_rtnl_message_close_container(m);
1823         if (r < 0) {
1824                 log_error("Failed to close netlink container: %s", strerror(-r));
1825                 return r;
1826         }
1827
1828         r = sd_rtnl_message_close_container(m);
1829         if (r < 0) {
1830                 log_error("Failed to close netlink container: %s", strerror(-r));
1831                 return r;
1832         }
1833
1834         r = sd_rtnl_call(rtnl, m, 0, NULL);
1835         if (r < 0) {
1836                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1837                 return r;
1838         }
1839
1840         i = (int) if_nametoindex(iface_name);
1841         if (i <= 0) {
1842                 log_error("Failed to resolve interface %s: %m", iface_name);
1843                 return -errno;
1844         }
1845
1846         *ifi = i;
1847
1848         return 0;
1849 }
1850
1851 static int setup_bridge(const char veth_name[], int *ifi) {
1852         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1853         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1854         int r, bridge;
1855
1856         if (!arg_private_network)
1857                 return 0;
1858
1859         if (!arg_network_veth)
1860                 return 0;
1861
1862         if (!arg_network_bridge)
1863                 return 0;
1864
1865         bridge = (int) if_nametoindex(arg_network_bridge);
1866         if (bridge <= 0) {
1867                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1868                 return -errno;
1869         }
1870
1871         *ifi = bridge;
1872
1873         r = sd_rtnl_open(&rtnl, 0);
1874         if (r < 0) {
1875                 log_error("Failed to connect to netlink: %s", strerror(-r));
1876                 return r;
1877         }
1878
1879         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1880         if (r < 0) {
1881                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1882                 return r;
1883         }
1884
1885         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1886         if (r < 0) {
1887                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1888                 return r;
1889         }
1890
1891         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1892         if (r < 0) {
1893                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1894                 return r;
1895         }
1896
1897         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1898         if (r < 0) {
1899                 log_error("Failed to add netlink master field: %s", strerror(-r));
1900                 return r;
1901         }
1902
1903         r = sd_rtnl_call(rtnl, m, 0, NULL);
1904         if (r < 0) {
1905                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1906                 return r;
1907         }
1908
1909         return 0;
1910 }
1911
1912 static int parse_interface(struct udev *udev, const char *name) {
1913         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1914         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1915         int ifi;
1916
1917         ifi = (int) if_nametoindex(name);
1918         if (ifi <= 0) {
1919                 log_error("Failed to resolve interface %s: %m", name);
1920                 return -errno;
1921         }
1922
1923         sprintf(ifi_str, "n%i", ifi);
1924         d = udev_device_new_from_device_id(udev, ifi_str);
1925         if (!d) {
1926                 log_error("Failed to get udev device for interface %s: %m", name);
1927                 return -errno;
1928         }
1929
1930         if (udev_device_get_is_initialized(d) <= 0) {
1931                 log_error("Network interface %s is not initialized yet.", name);
1932                 return -EBUSY;
1933         }
1934
1935         return ifi;
1936 }
1937
1938 static int move_network_interfaces(pid_t pid) {
1939         _cleanup_udev_unref_ struct udev *udev = NULL;
1940         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1941         char **i;
1942         int r;
1943
1944         if (!arg_private_network)
1945                 return 0;
1946
1947         if (strv_isempty(arg_network_interfaces))
1948                 return 0;
1949
1950         r = sd_rtnl_open(&rtnl, 0);
1951         if (r < 0) {
1952                 log_error("Failed to connect to netlink: %s", strerror(-r));
1953                 return r;
1954         }
1955
1956         udev = udev_new();
1957         if (!udev) {
1958                 log_error("Failed to connect to udev.");
1959                 return -ENOMEM;
1960         }
1961
1962         STRV_FOREACH(i, arg_network_interfaces) {
1963                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1964                 int ifi;
1965
1966                 ifi = parse_interface(udev, *i);
1967                 if (ifi < 0)
1968                         return ifi;
1969
1970                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1971                 if (r < 0) {
1972                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1973                         return r;
1974                 }
1975
1976                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1977                 if (r < 0) {
1978                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1979                         return r;
1980                 }
1981
1982                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1983                 if (r < 0) {
1984                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1985                         return r;
1986                 }
1987         }
1988
1989         return 0;
1990 }
1991
1992 static int setup_macvlan(pid_t pid) {
1993         _cleanup_udev_unref_ struct udev *udev = NULL;
1994         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1995         char **i;
1996         int r;
1997
1998         if (!arg_private_network)
1999                 return 0;
2000
2001         if (strv_isempty(arg_network_macvlan))
2002                 return 0;
2003
2004         r = sd_rtnl_open(&rtnl, 0);
2005         if (r < 0) {
2006                 log_error("Failed to connect to netlink: %s", strerror(-r));
2007                 return r;
2008         }
2009
2010         udev = udev_new();
2011         if (!udev) {
2012                 log_error("Failed to connect to udev.");
2013                 return -ENOMEM;
2014         }
2015
2016         STRV_FOREACH(i, arg_network_macvlan) {
2017                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2018                 _cleanup_free_ char *n = NULL;
2019                 int ifi;
2020
2021                 ifi = parse_interface(udev, *i);
2022                 if (ifi < 0)
2023                         return ifi;
2024
2025                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2026                 if (r < 0) {
2027                         log_error("Failed to allocate netlink message: %s", strerror(-r));
2028                         return r;
2029                 }
2030
2031                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2032                 if (r < 0) {
2033                         log_error("Failed to add netlink interface index: %s", strerror(-r));
2034                         return r;
2035                 }
2036
2037                 n = strappend("mv-", *i);
2038                 if (!n)
2039                         return log_oom();
2040
2041                 strshorten(n, IFNAMSIZ-1);
2042
2043                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2044                 if (r < 0) {
2045                         log_error("Failed to add netlink interface name: %s", strerror(-r));
2046                         return r;
2047                 }
2048
2049                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2050                 if (r < 0) {
2051                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
2052                         return r;
2053                 }
2054
2055                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2056                 if (r < 0) {
2057                         log_error("Failed to open netlink container: %s", strerror(-r));
2058                         return r;
2059                 }
2060
2061                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2062                 if (r < 0) {
2063                         log_error("Failed to open netlink container: %s", strerror(-r));
2064                         return r;
2065                 }
2066
2067                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2068                 if (r < 0) {
2069                         log_error("Failed to append macvlan mode: %s", strerror(-r));
2070                         return r;
2071                 }
2072
2073                 r = sd_rtnl_message_close_container(m);
2074                 if (r < 0) {
2075                         log_error("Failed to close netlink container: %s", strerror(-r));
2076                         return r;
2077                 }
2078
2079                 r = sd_rtnl_message_close_container(m);
2080                 if (r < 0) {
2081                         log_error("Failed to close netlink container: %s", strerror(-r));
2082                         return r;
2083                 }
2084
2085                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2086                 if (r < 0) {
2087                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
2088                         return r;
2089                 }
2090         }
2091
2092         return 0;
2093 }
2094
/* Installs a seccomp filter for the calling process.
 *
 * The filter allows everything by default, makes a fixed list of system
 * calls fail with EPERM, and makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail
 * with EAFNOSUPPORT. Returns 0 on success and a negative error code on
 * failure. When built without HAVE_SECCOMP this is a no-op returning 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* System calls that are denied with EPERM. */
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned idx;
        int ret;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        ret = seccomp_add_secondary_archs(ctx);
        if (ret < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-ret));
                goto out;
        }

        for (idx = 0; idx < ELEMENTSOF(blocked_syscalls); idx++) {
                ret = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[idx], 0);

                /* -EFAULT indicates an unknown syscall and is skipped
                 * silently; any other failure is fatal. */
                if (ret < 0 && ret != -EFAULT) {
                        log_error("Failed to block syscall: %s", strerror(-ret));
                        goto out;
                }
        }

        /*
           Audit is broken in containers, and much of the userspace
           audit hookup fails when run inside one. We do not care and
           simply turn off the creation of audit sockets.

           This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
           EAFNOSUPPORT, which audit userspace takes as an indication
           that audit is disabled in the kernel.
         */

        ret = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (ret < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-ret));
                goto out;
        }

        ret = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (ret < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-ret));
                goto out;
        }

        ret = seccomp_load(ctx);
        if (ret < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-ret));

out:
        /* The filter context is released on success and failure alike. */
        seccomp_release(ctx);
        return ret;
#else
        return 0;
#endif

}
2174
2175 static int setup_image(char **device_path, int *loop_nr) {
2176         struct loop_info64 info = {
2177                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2178         };
2179         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2180         _cleanup_free_ char* loopdev = NULL;
2181         struct stat st;
2182         int r, nr;
2183
2184         assert(device_path);
2185         assert(loop_nr);
2186
2187         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2188         if (fd < 0) {
2189                 log_error("Failed to open %s: %m", arg_image);
2190                 return -errno;
2191         }
2192
2193         if (fstat(fd, &st) < 0) {
2194                 log_error("Failed to stat %s: %m", arg_image);
2195                 return -errno;
2196         }
2197
2198         if (S_ISBLK(st.st_mode)) {
2199                 char *p;
2200
2201                 p = strdup(arg_image);
2202                 if (!p)
2203                         return log_oom();
2204
2205                 *device_path = p;
2206
2207                 *loop_nr = -1;
2208
2209                 r = fd;
2210                 fd = -1;
2211
2212                 return r;
2213         }
2214
2215         if (!S_ISREG(st.st_mode)) {
2216                 log_error("%s is not a regular file or block device: %m", arg_image);
2217                 return -EINVAL;
2218         }
2219
2220         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2221         if (control < 0) {
2222                 log_error("Failed to open /dev/loop-control: %m");
2223                 return -errno;
2224         }
2225
2226         nr = ioctl(control, LOOP_CTL_GET_FREE);
2227         if (nr < 0) {
2228                 log_error("Failed to allocate loop device: %m");
2229                 return -errno;
2230         }
2231
2232         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2233                 return log_oom();
2234
2235         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2236         if (loop < 0) {
2237                 log_error("Failed to open loop device %s: %m", loopdev);
2238                 return -errno;
2239         }
2240
2241         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2242                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2243                 return -errno;
2244         }
2245
2246         if (arg_read_only)
2247                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2248
2249         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2250                 log_error("Failed to set loopback settings on %s: %m", loopdev);
2251                 return -errno;
2252         }
2253
2254         *device_path = loopdev;
2255         loopdev = NULL;
2256
2257         *loop_nr = nr;
2258
2259         r = loop;
2260         loop = -1;
2261
2262         return r;
2263 }
2264
2265 static int dissect_image(
2266                 int fd,
2267                 char **root_device, bool *root_device_rw,
2268                 char **home_device, bool *home_device_rw,
2269                 char **srv_device, bool *srv_device_rw,
2270                 bool *secondary) {
2271
2272 #ifdef HAVE_BLKID
2273         int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
2274         _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2275         _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2276         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2277         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2278         _cleanup_udev_unref_ struct udev *udev = NULL;
2279         struct udev_list_entry *first, *item;
2280         bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2281         const char *pttype = NULL;
2282         blkid_partlist pl;
2283         struct stat st;
2284         int r;
2285
2286         assert(fd >= 0);
2287         assert(root_device);
2288         assert(home_device);
2289         assert(srv_device);
2290         assert(secondary);
2291
2292         b = blkid_new_probe();
2293         if (!b)
2294                 return log_oom();
2295
2296         errno = 0;
2297         r = blkid_probe_set_device(b, fd, 0, 0);
2298         if (r != 0) {
2299                 if (errno == 0)
2300                         return log_oom();
2301
2302                 log_error("Failed to set device on blkid probe: %m");
2303                 return -errno;
2304         }
2305
2306         blkid_probe_enable_partitions(b, 1);
2307         blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2308
2309         errno = 0;
2310         r = blkid_do_safeprobe(b);
2311         if (r == -2 || r == 1) {
2312                 log_error("Failed to identify any partition table on %s.\n"
2313                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2314                 return -EINVAL;
2315         } else if (r != 0) {
2316                 if (errno == 0)
2317                         errno = EIO;
2318                 log_error("Failed to probe: %m");
2319                 return -errno;
2320         }
2321
2322         blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2323         if (!streq_ptr(pttype, "gpt")) {
2324                 log_error("Image %s does not carry a GUID Partition Table.\n"
2325                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2326                 return -EINVAL;
2327         }
2328
2329         errno = 0;
2330         pl = blkid_probe_get_partitions(b);
2331         if (!pl) {
2332                 if (errno == 0)
2333                         return log_oom();
2334
2335                 log_error("Failed to list partitions of %s", arg_image);
2336                 return -errno;
2337         }
2338
2339         udev = udev_new();
2340         if (!udev)
2341                 return log_oom();
2342
2343         if (fstat(fd, &st) < 0) {
2344                 log_error("Failed to stat block device: %m");
2345                 return -errno;
2346         }
2347
2348         d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2349         if (!d)
2350                 return log_oom();
2351
2352         e = udev_enumerate_new(udev);
2353         if (!e)
2354                 return log_oom();
2355
2356         r = udev_enumerate_add_match_parent(e, d);
2357         if (r < 0)
2358                 return log_oom();
2359
2360         r = udev_enumerate_scan_devices(e);
2361         if (r < 0) {
2362                 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
2363                 return r;
2364         }
2365
2366         first = udev_enumerate_get_list_entry(e);
2367         udev_list_entry_foreach(item, first) {
2368                 _cleanup_udev_device_unref_ struct udev_device *q;
2369                 const char *stype, *node;
2370                 unsigned long long flags;
2371                 sd_id128_t type_id;
2372                 blkid_partition pp;
2373                 dev_t qn;
2374                 int nr;
2375
2376                 errno = 0;
2377                 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2378                 if (!q) {
2379                         if (!errno)
2380                                 errno = ENOMEM;
2381
2382                         log_error("Failed to get partition device of %s: %m", arg_image);
2383                         return -errno;
2384                 }
2385
2386                 qn = udev_device_get_devnum(q);
2387                 if (major(qn) == 0)
2388                         continue;
2389
2390                 if (st.st_rdev == qn)
2391                         continue;
2392
2393                 node = udev_device_get_devnode(q);
2394                 if (!node)
2395                         continue;
2396
2397                 pp = blkid_partlist_devno_to_partition(pl, qn);
2398                 if (!pp)
2399                         continue;
2400
2401                 flags = blkid_partition_get_flags(pp);
2402                 if (flags & GPT_FLAG_NO_AUTO)
2403                         continue;
2404
2405                 nr = blkid_partition_get_partno(pp);
2406                 if (nr < 0)
2407                         continue;
2408
2409                 stype = blkid_partition_get_type_string(pp);
2410                 if (!stype)
2411                         continue;
2412
2413                 if (sd_id128_from_string(stype, &type_id) < 0)
2414                         continue;
2415
2416                 if (sd_id128_equal(type_id, GPT_HOME)) {
2417
2418                         if (home && nr >= home_nr)
2419                                 continue;
2420
2421                         home_nr = nr;
2422                         home_rw = !(flags & GPT_FLAG_READ_ONLY);
2423
2424                         free(home);
2425                         home = strdup(node);
2426                         if (!home)
2427                                 return log_oom();
2428                 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2429
2430                         if (srv && nr >= srv_nr)
2431                                 continue;
2432
2433                         srv_nr = nr;
2434                         srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2435
2436                         free(srv);
2437                         srv = strdup(node);
2438                         if (!srv)
2439                                 return log_oom();
2440                 }
2441 #ifdef GPT_ROOT_NATIVE
2442                 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2443
2444                         if (root && nr >= root_nr)
2445                                 continue;
2446
2447                         root_nr = nr;
2448                         root_rw = !(flags & GPT_FLAG_READ_ONLY);
2449
2450                         free(root);
2451                         root = strdup(node);
2452                         if (!root)
2453                                 return log_oom();
2454                 }
2455 #endif
2456 #ifdef GPT_ROOT_SECONDARY
2457                 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2458
2459                         if (secondary_root && nr >= secondary_root_nr)
2460                                 continue;
2461
2462                         secondary_root_nr = nr;
2463                         secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2464
2465
2466                         free(secondary_root);
2467                         secondary_root = strdup(node);
2468                         if (!secondary_root)
2469                                 return log_oom();
2470                 }
2471 #endif
2472         }
2473
2474         if (!root && !secondary_root) {
2475                 log_error("Failed to identify root partition in disk image %s.\n"
2476                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2477                 return -EINVAL;
2478         }
2479
2480         if (root) {
2481                 *root_device = root;
2482                 root = NULL;
2483
2484                 *root_device_rw = root_rw;
2485                 *secondary = false;
2486         } else if (secondary_root) {
2487                 *root_device = secondary_root;
2488                 secondary_root = NULL;
2489
2490                 *root_device_rw = secondary_root_rw;
2491                 *secondary = true;
2492         }
2493
2494         if (home) {
2495                 *home_device = home;
2496                 home = NULL;
2497
2498                 *home_device_rw = home_rw;
2499         }
2500
2501         if (srv) {
2502                 *srv_device = srv;
2503                 srv = NULL;
2504
2505                 *srv_device_rw = srv_rw;
2506         }
2507
2508         return 0;
2509 #else
2510         log_error("--image= is not supported, compiled without blkid support.");
2511         return -ENOTSUP;
2512 #endif
2513 }
2514
/* Probes the block device "what" with libblkid for its file system type and
 * mounts it (with MS_NODEV) on "where", or on "where" with "directory"
 * appended when "directory" is non-NULL. The mount is read-only if rw is
 * false or if arg_read_only is set. Devices whose type is "crypto_LUKS" are
 * refused with -ENOTSUP.
 *
 * Returns 0 on success and a negative errno-style value on failure. When
 * compiled without HAVE_BLKID the function always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        unsigned long mount_flags = MS_NODEV;
        const char *type, *target;
        int q;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-device setting */
        if (arg_read_only || !rw)
                mount_flags |= MS_RDONLY;

        target = where;
        if (directory)
                target = strappenda(where, directory);

        /* errno is cleared first so that a NULL result without errno set
         * can be reported as an allocation failure */
        errno = 0;
        probe = blkid_new_probe_from_filename(what);
        if (!probe) {
                if (errno != 0) {
                        log_error("Failed to allocate prober for %s: %m", what);
                        return -errno;
                }

                return log_oom();
        }

        /* Only the superblock type is of interest here */
        blkid_probe_enable_superblocks(probe, 1);
        blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

        errno = 0;
        q = blkid_do_safeprobe(probe);
        if (q == 1 || q == -1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        }
        if (q != 0) {
                /* Any other non-zero result is reported via errno, falling
                 * back to EIO if the library left errno untouched */
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(probe, "TYPE", &type, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(type, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, target, type, mount_flags, NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2580
/* Mounts the partitions found in a disk image below the directory "where":
 * the root device on "where" itself, the home device on "where" + "/home"
 * and the srv device on "where" + "/srv". Any device passed as NULL is
 * skipped. The *_rw flags are handed through to mount_device() and select a
 * writable (true) or read-only (false) mount.
 *
 * Returns 0 on success, or the negative errno-style value of the first
 * mount_device() call that failed; later devices are then not attempted. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the "where" argument rather than reaching for
         * the global arg_directory, so the parameter is actually honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2616
2617 static void loop_remove(int nr, int *image_fd) {
2618         _cleanup_close_ int control = -1;
2619         int r;
2620
2621         if (nr < 0)
2622                 return;
2623
2624         if (image_fd && *image_fd >= 0) {
2625                 r = ioctl(*image_fd, LOOP_CLR_FD);
2626                 if (r < 0)
2627                         log_warning("Failed to close loop image: %m");
2628                 *image_fd = safe_close(*image_fd);
2629         }
2630
2631         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2632         if (control < 0) {
2633                 log_warning("Failed to open /dev/loop-control: %m");
2634                 return;
2635         }
2636
2637         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2638         if (r < 0)
2639                 log_warning("Failed to remove loop %d: %m", nr);
2640 }
2641
2642 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2643         int pipe_fds[2];
2644         pid_t pid;
2645
2646         assert(database);
2647         assert(key);
2648         assert(rpid);
2649
2650         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2651                 log_error("Failed to allocate pipe: %m");
2652                 return -errno;
2653         }
2654
2655         pid = fork();
2656         if (pid < 0) {
2657                 log_error("Failed to fork getent child: %m");
2658                 return -errno;
2659         } else if (pid == 0) {
2660                 int nullfd;
2661                 char *empty_env = NULL;
2662
2663                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2664                         _exit(EXIT_FAILURE);
2665
2666                 if (pipe_fds[0] > 2)
2667                         safe_close(pipe_fds[0]);
2668                 if (pipe_fds[1] > 2)
2669                         safe_close(pipe_fds[1]);
2670
2671                 nullfd = open("/dev/null", O_RDWR);
2672                 if (nullfd < 0)
2673                         _exit(EXIT_FAILURE);
2674
2675                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2676                         _exit(EXIT_FAILURE);
2677
2678                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2679                         _exit(EXIT_FAILURE);
2680
2681                 if (nullfd > 2)
2682                         safe_close(nullfd);
2683
2684                 reset_all_signal_handlers();
2685                 close_all_fds(NULL, 0);
2686
2687                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2688                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2689                 _exit(EXIT_FAILURE);
2690         }
2691
2692         pipe_fds[1] = safe_close(pipe_fds[1]);
2693
2694         *rpid = pid;
2695
2696         return pipe_fds[0];
2697 }
2698
2699 static int change_uid_gid(char **_home) {
2700         char line[LINE_MAX], *x, *u, *g, *h;
2701         const char *word, *state;
2702         _cleanup_free_ uid_t *uids = NULL;
2703         _cleanup_free_ char *home = NULL;
2704         _cleanup_fclose_ FILE *f = NULL;
2705         _cleanup_close_ int fd = -1;
2706         unsigned n_uids = 0;
2707         size_t sz = 0, l;
2708         uid_t uid;
2709         gid_t gid;
2710         pid_t pid;
2711         int r;
2712
2713         assert(_home);
2714
2715         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2716                 /* Reset everything fully to 0, just in case */
2717
2718                 if (setgroups(0, NULL) < 0) {
2719                         log_error("setgroups() failed: %m");
2720                         return -errno;
2721                 }
2722
2723                 if (setresgid(0, 0, 0) < 0) {
2724                         log_error("setregid() failed: %m");
2725                         return -errno;
2726                 }
2727
2728                 if (setresuid(0, 0, 0) < 0) {
2729                         log_error("setreuid() failed: %m");
2730                         return -errno;
2731                 }
2732
2733                 *_home = NULL;
2734                 return 0;
2735         }
2736
2737         /* First, get user credentials */
2738         fd = spawn_getent("passwd", arg_user, &pid);
2739         if (fd < 0)
2740                 return fd;
2741
2742         f = fdopen(fd, "r");
2743         if (!f)
2744                 return log_oom();
2745         fd = -1;
2746
2747         if (!fgets(line, sizeof(line), f)) {
2748
2749                 if (!ferror(f)) {
2750                         log_error("Failed to resolve user %s.", arg_user);
2751                         return -ESRCH;
2752                 }
2753
2754                 log_error("Failed to read from getent: %m");
2755                 return -errno;
2756         }
2757
2758         truncate_nl(line);
2759
2760         wait_for_terminate_and_warn("getent passwd", pid);
2761
2762         x = strchr(line, ':');
2763         if (!x) {
2764                 log_error("/etc/passwd entry has invalid user field.");
2765                 return -EIO;
2766         }
2767
2768         u = strchr(x+1, ':');
2769         if (!u) {
2770                 log_error("/etc/passwd entry has invalid password field.");
2771                 return -EIO;
2772         }
2773
2774         u++;
2775         g = strchr(u, ':');
2776         if (!g) {
2777                 log_error("/etc/passwd entry has invalid UID field.");
2778                 return -EIO;
2779         }
2780
2781         *g = 0;
2782         g++;
2783         x = strchr(g, ':');
2784         if (!x) {
2785                 log_error("/etc/passwd entry has invalid GID field.");
2786                 return -EIO;
2787         }
2788
2789         *x = 0;
2790         h = strchr(x+1, ':');
2791         if (!h) {
2792                 log_error("/etc/passwd entry has invalid GECOS field.");
2793                 return -EIO;
2794         }
2795
2796         h++;
2797         x = strchr(h, ':');
2798         if (!x) {
2799                 log_error("/etc/passwd entry has invalid home directory field.");
2800                 return -EIO;
2801         }
2802
2803         *x = 0;
2804
2805         r = parse_uid(u, &uid);
2806         if (r < 0) {
2807                 log_error("Failed to parse UID of user.");
2808                 return -EIO;
2809         }
2810
2811         r = parse_gid(g, &gid);
2812         if (r < 0) {
2813                 log_error("Failed to parse GID of user.");
2814                 return -EIO;
2815         }
2816
2817         home = strdup(h);
2818         if (!home)
2819                 return log_oom();
2820
2821         /* Second, get group memberships */
2822         fd = spawn_getent("initgroups", arg_user, &pid);
2823         if (fd < 0)
2824                 return fd;
2825
2826         fclose(f);
2827         f = fdopen(fd, "r");
2828         if (!f)
2829                 return log_oom();
2830         fd = -1;
2831
2832         if (!fgets(line, sizeof(line), f)) {
2833                 if (!ferror(f)) {
2834                         log_error("Failed to resolve user %s.", arg_user);
2835                         return -ESRCH;
2836                 }
2837
2838                 log_error("Failed to read from getent: %m");
2839                 return -errno;
2840         }
2841
2842         truncate_nl(line);
2843
2844         wait_for_terminate_and_warn("getent initgroups", pid);
2845
2846         /* Skip over the username and subsequent separator whitespace */
2847         x = line;
2848         x += strcspn(x, WHITESPACE);
2849         x += strspn(x, WHITESPACE);
2850
2851         FOREACH_WORD(word, l, x, state) {
2852                 char c[l+1];
2853
2854                 memcpy(c, word, l);
2855                 c[l] = 0;
2856
2857                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2858                         return log_oom();
2859
2860                 r = parse_uid(c, &uids[n_uids++]);
2861                 if (r < 0) {
2862                         log_error("Failed to parse group data from getent.");
2863                         return -EIO;
2864                 }
2865         }
2866
2867         r = mkdir_parents(home, 0775);
2868         if (r < 0) {
2869                 log_error("Failed to make home root directory: %s", strerror(-r));
2870                 return r;
2871         }
2872
2873         r = mkdir_safe(home, 0755, uid, gid);
2874         if (r < 0 && r != -EEXIST) {
2875                 log_error("Failed to make home directory: %s", strerror(-r));
2876                 return r;
2877         }
2878
2879         fchown(STDIN_FILENO, uid, gid);
2880         fchown(STDOUT_FILENO, uid, gid);
2881         fchown(STDERR_FILENO, uid, gid);
2882
2883         if (setgroups(n_uids, uids) < 0) {
2884                 log_error("Failed to set auxiliary groups: %m");
2885                 return -errno;
2886         }
2887
2888         if (setresgid(gid, gid, gid) < 0) {
2889                 log_error("setregid() failed: %m");
2890                 return -errno;
2891         }
2892
2893         if (setresuid(uid, uid, uid) < 0) {
2894                 log_error("setreuid() failed: %m");
2895                 return -errno;
2896         }
2897
2898         if (_home) {
2899                 *_home = home;
2900                 home = NULL;
2901         }
2902
2903         return 0;
2904 }
2905
2906 /*
2907  * Return values:
2908  * < 0 : wait_for_terminate() failed to get the state of the
2909  *       container, the container was terminated by a signal, or
2910  *       failed for an unknown reason.  No change is made to the
2911  *       container argument.
2912  * > 0 : The program executed in the container terminated with an
2913  *       error.  The exit code of the program executed in the
2914  *       container is returned.  No change is made to the container
2915  *       argument.
2916  *   0 : The container is being rebooted, has been shut down or exited
2917  *       successfully.  The container argument has been set to either
2918  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2919  *
2920  * That is, success is indicated by a return value of zero, and an
2921  * error is indicated by a non-zero value.
2922  */
2923 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2924         int r;
2925         siginfo_t status;
2926
2927         r = wait_for_terminate(pid, &status);
2928         if (r < 0) {
2929                 log_warning("Failed to wait for container: %s", strerror(-r));
2930                 return r;
2931         }
2932
2933         switch (status.si_code) {
2934         case CLD_EXITED:
2935                 r = status.si_status;
2936                 if (r == 0) {
2937                         if (!arg_quiet)
2938                                 log_debug("Container %s exited successfully.",
2939                                           arg_machine);
2940
2941                         *container = CONTAINER_TERMINATED;
2942                 } else {
2943                         log_error("Container %s failed with error code %i.",
2944                                   arg_machine, status.si_status);
2945                 }
2946                 break;
2947
2948         case CLD_KILLED:
2949                 if (status.si_status == SIGINT) {
2950                         if (!arg_quiet)
2951                                 log_info("Container %s has been shut down.",
2952                                          arg_machine);
2953
2954                         *container = CONTAINER_TERMINATED;
2955                         r = 0;
2956                         break;
2957                 } else if (status.si_status == SIGHUP) {
2958                         if (!arg_quiet)
2959                                 log_info("Container %s is being rebooted.",
2960                                          arg_machine);
2961
2962                         *container = CONTAINER_REBOOTED;
2963                         r = 0;
2964                         break;
2965                 }
2966                 /* CLD_KILLED fallthrough */
2967
2968         case CLD_DUMPED:
2969                 log_error("Container %s terminated by signal %s.",
2970                           arg_machine, signal_to_string(status.si_status));
2971                 r = -1;
2972                 break;
2973
2974         default:
2975                 log_error("Container %s failed due to unknown reason.",
2976                           arg_machine);
2977                 r = -1;
2978                 break;
2979         }
2980
2981         return r;
2982 }
2983
/* Signal handler that intentionally does nothing; main() installs it for
 * SIGCHLD so that the signal interrupts the parent's blocking calls. */
static void nop_handler(int sig) {
        (void) sig;
}
2985
2986 int main(int argc, char *argv[]) {
2987
2988         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2989         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2990         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2991         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2992         _cleanup_fdset_free_ FDSet *fds = NULL;
2993         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2994         const char *console = NULL;
2995         char veth_name[IFNAMSIZ];
2996         bool secondary = false;
2997         sigset_t mask, mask_chld;
2998         pid_t pid = 0;
2999
3000         log_parse_environment();
3001         log_open();
3002
3003         k = parse_argv(argc, argv);
3004         if (k < 0)
3005                 goto finish;
3006         else if (k == 0) {
3007                 r = EXIT_SUCCESS;
3008                 goto finish;
3009         }
3010
3011         if (!arg_image) {
3012                 if (arg_directory) {
3013                         char *p;
3014
3015                         p = path_make_absolute_cwd(arg_directory);
3016                         free(arg_directory);
3017                         arg_directory = p;
3018                 } else
3019                         arg_directory = get_current_dir_name();
3020
3021                 if (!arg_directory) {
3022                         log_error("Failed to determine path, please use -D.");
3023                         goto finish;
3024                 }
3025                 path_kill_slashes(arg_directory);
3026         }
3027
3028         if (!arg_machine) {
3029                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
3030                 if (!arg_machine) {
3031                         log_oom();
3032                         goto finish;
3033                 }
3034
3035                 hostname_cleanup(arg_machine, false);
3036                 if (isempty(arg_machine)) {
3037                         log_error("Failed to determine machine name automatically, please use -M.");
3038                         goto finish;
3039                 }
3040         }
3041
3042         if (geteuid() != 0) {
3043                 log_error("Need to be root.");
3044                 goto finish;
3045         }
3046
3047         if (sd_booted() <= 0) {
3048                 log_error("Not running on a systemd system.");
3049                 goto finish;
3050         }
3051
3052         log_close();
3053         n_fd_passed = sd_listen_fds(false);
3054         if (n_fd_passed > 0) {
3055                 k = fdset_new_listen_fds(&fds, false);
3056                 if (k < 0) {
3057                         log_error("Failed to collect file descriptors: %s", strerror(-k));
3058                         goto finish;
3059                 }
3060         }
3061         fdset_close_others(fds);
3062         log_open();
3063
3064         if (arg_directory) {
3065                 if (path_equal(arg_directory, "/")) {
3066                         log_error("Spawning container on root directory not supported.");
3067                         goto finish;
3068                 }
3069
3070                 if (arg_boot) {
3071                         if (path_is_os_tree(arg_directory) <= 0) {
3072                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3073                                 goto finish;
3074                         }
3075                 } else {
3076                         const char *p;
3077
3078                         p = strappenda(arg_directory,
3079                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3080                         if (access(p, F_OK) < 0) {
3081                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3082                                 goto finish;
3083
3084                         }
3085                 }
3086         } else {
3087                 char template[] = "/tmp/nspawn-root-XXXXXX";
3088
3089                 if (!mkdtemp(template)) {
3090                         log_error("Failed to create temporary directory: %m");
3091                         r = -errno;
3092                         goto finish;
3093                 }
3094
3095                 arg_directory = strdup(template);
3096                 if (!arg_directory) {
3097                         r = log_oom();
3098                         goto finish;
3099                 }
3100
3101                 image_fd = setup_image(&device_path, &loop_nr);
3102                 if (image_fd < 0) {
3103                         r = image_fd;
3104                         goto finish;
3105                 }
3106
3107                 r = dissect_image(image_fd,
3108                                   &root_device, &root_device_rw,
3109                                   &home_device, &home_device_rw,
3110                                   &srv_device, &srv_device_rw,
3111                                   &secondary);
3112                 if (r < 0)
3113                         goto finish;
3114         }
3115
3116         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3117         if (master < 0) {
3118                 log_error("Failed to acquire pseudo tty: %m");
3119                 goto finish;
3120         }
3121
3122         console = ptsname(master);
3123         if (!console) {
3124                 log_error("Failed to determine tty name: %m");
3125                 goto finish;
3126         }
3127
3128         if (!arg_quiet)
3129                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3130                          arg_machine, arg_image ? arg_image : arg_directory);
3131
3132         if (unlockpt(master) < 0) {
3133                 log_error("Failed to unlock tty: %m");
3134                 goto finish;
3135         }
3136
3137         if (access("/dev/kdbus/control", F_OK) >= 0) {
3138
3139                 if (arg_share_system) {
3140                         kdbus_domain = strdup("/dev/kdbus");
3141                         if (!kdbus_domain) {
3142                                 log_oom();
3143                                 goto finish;
3144                         }
3145                 } else {
3146                         const char *ns;
3147
3148                         ns = strappenda("machine-", arg_machine);
3149                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
3150                         if (r < 0)
3151                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
3152                         else
3153                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
3154                 }
3155         }
3156
3157         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3158                 log_error("Failed to create kmsg socket pair: %m");
3159                 goto finish;
3160         }
3161
3162         sd_notify(false,
3163                   "READY=1\n"
3164                   "STATUS=Container running.");
3165
3166         assert_se(sigemptyset(&mask) == 0);
3167         assert_se(sigemptyset(&mask_chld) == 0);
3168         sigaddset(&mask_chld, SIGCHLD);
3169         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3170         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3171
3172         for (;;) {
3173                 ContainerStatus container_status;
3174                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3175                 struct sigaction sa = {
3176                         .sa_handler = nop_handler,
3177                         .sa_flags = SA_NOCLDSTOP,
3178                 };
3179
3180                 r = barrier_create(&barrier);
3181                 if (r < 0) {
3182                         log_error("Cannot initialize IPC barrier: %s", strerror(-r));
3183                         goto finish;
3184                 }
3185
3186                 /* Child can be killed before execv(), so handle SIGCHLD
3187                  * in order to interrupt parent's blocking calls and
3188                  * give it a chance to call wait() and terminate. */
3189                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3190                 if (r < 0) {
3191                         log_error("Failed to change the signal mask: %m");
3192                         goto finish;
3193                 }
3194
3195                 r = sigaction(SIGCHLD, &sa, NULL);
3196                 if (r < 0) {
3197                         log_error("Failed to install SIGCHLD handler: %m");
3198                         goto finish;
3199                 }
3200
3201                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3202                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3203                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
3204                 if (pid < 0) {
3205                         if (errno == EINVAL)
3206                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3207                         else
3208                                 log_error("clone() failed: %m");
3209
3210                         r = pid;
3211                         goto finish;
3212                 }
3213
3214                 if (pid == 0) {
3215                         /* child */
3216                         _cleanup_free_ char *home = NULL;
3217                         unsigned n_env = 2;
3218                         const char *envp[] = {
3219                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3220                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3221                                 NULL, /* TERM */
3222                                 NULL, /* HOME */
3223                                 NULL, /* USER */
3224                                 NULL, /* LOGNAME */
3225                                 NULL, /* container_uuid */
3226                                 NULL, /* LISTEN_FDS */
3227                                 NULL, /* LISTEN_PID */
3228                                 NULL
3229                         };
3230                         char **env_use;
3231
3232                         barrier_set_role(&barrier, BARRIER_CHILD);
3233
3234                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3235                         if (envp[n_env])
3236                                 n_env ++;
3237
3238                         master = safe_close(master);
3239
3240                         close_nointr(STDIN_FILENO);
3241                         close_nointr(STDOUT_FILENO);
3242                         close_nointr(STDERR_FILENO);
3243
3244                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3245
3246                         reset_all_signal_handlers();
3247                         reset_signal_mask();
3248
3249                         k = open_terminal(console, O_RDWR);
3250                         if (k != STDIN_FILENO) {
3251                                 if (k >= 0) {
3252                                         safe_close(k);
3253                                         k = -EINVAL;
3254                                 }
3255
3256                                 log_error("Failed to open console: %s", strerror(-k));
3257                                 _exit(EXIT_FAILURE);
3258                         }
3259
3260                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3261                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3262                                 log_error("Failed to duplicate console: %m");
3263                                 _exit(EXIT_FAILURE);
3264                         }
3265
3266                         if (setsid() < 0) {
3267                                 log_error("setsid() failed: %m");
3268                                 _exit(EXIT_FAILURE);
3269                         }
3270
3271                         if (reset_audit_loginuid() < 0)
3272                                 _exit(EXIT_FAILURE);
3273
3274                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3275                                 log_error("PR_SET_PDEATHSIG failed: %m");
3276                                 _exit(EXIT_FAILURE);
3277                         }
3278
3279                         /* Mark everything as slave, so that we still
3280                          * receive mounts from the real root, but don't
3281                          * propagate mounts to the real root. */
3282                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3283                                 log_error("MS_SLAVE|MS_REC failed: %m");
3284                                 _exit(EXIT_FAILURE);
3285                         }
3286
3287                         if (mount_devices(arg_directory,
3288                                           root_device, root_device_rw,
3289                                           home_device, home_device_rw,
3290                                           srv_device, srv_device_rw) < 0)
3291                                 _exit(EXIT_FAILURE);
3292
3293                         /* Turn directory into bind mount */
3294                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3295                                 log_error("Failed to make bind mount: %m");
3296                                 _exit(EXIT_FAILURE);
3297                         }
3298
3299                         r = setup_volatile(arg_directory);
3300                         if (r < 0)
3301                                 _exit(EXIT_FAILURE);
3302
3303                         if (setup_volatile_state(arg_directory) < 0)
3304                                 _exit(EXIT_FAILURE);
3305
3306                         r = base_filesystem_create(arg_directory);
3307                         if (r < 0)
3308                                 _exit(EXIT_FAILURE);
3309
3310                         if (arg_read_only) {
3311                                 k = bind_remount_recursive(arg_directory, true);
3312                                 if (k < 0) {
3313                                         log_error("Failed to make tree read-only: %s", strerror(-k));
3314                                         _exit(EXIT_FAILURE);
3315                                 }
3316                         }
3317
3318                         if (mount_all(arg_directory) < 0)
3319                                 _exit(EXIT_FAILURE);
3320
3321                         if (copy_devnodes(arg_directory) < 0)
3322                                 _exit(EXIT_FAILURE);
3323
3324                         if (setup_ptmx(arg_directory) < 0)
3325                                 _exit(EXIT_FAILURE);
3326
3327                         dev_setup(arg_directory);
3328
3329                         if (setup_seccomp() < 0)
3330                                 _exit(EXIT_FAILURE);
3331
3332                         if (setup_dev_console(arg_directory, console) < 0)
3333                                 _exit(EXIT_FAILURE);
3334
3335                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3336                                 _exit(EXIT_FAILURE);
3337
3338                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3339
3340                         if (setup_boot_id(arg_directory) < 0)
3341                                 _exit(EXIT_FAILURE);
3342
3343                         if (setup_timezone(arg_directory) < 0)
3344                                 _exit(EXIT_FAILURE);
3345
3346                         if (setup_resolv_conf(arg_directory) < 0)
3347                                 _exit(EXIT_FAILURE);
3348
3349                         if (setup_journal(arg_directory) < 0)
3350                                 _exit(EXIT_FAILURE);
3351
3352                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3353                                 _exit(EXIT_FAILURE);
3354
3355                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3356                                 _exit(EXIT_FAILURE);
3357
3358                         if (mount_tmpfs(arg_directory) < 0)
3359                                 _exit(EXIT_FAILURE);
3360
3361                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3362                                 _exit(EXIT_FAILURE);
3363
3364                         /* Tell the parent that we are ready, and that
3365                          * it can cgroupify us so that we lack access
3366                          * to certain devices and resources. */
3367                         barrier_place(&barrier);
3368
3369                         if (chdir(arg_directory) < 0) {
3370                                 log_error("chdir(%s) failed: %m", arg_directory);
3371                                 _exit(EXIT_FAILURE);
3372                         }
3373
3374                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3375                                 log_error("mount(MS_MOVE) failed: %m");
3376                                 _exit(EXIT_FAILURE);
3377                         }
3378
3379                         if (chroot(".") < 0) {
3380                                 log_error("chroot() failed: %m");
3381                                 _exit(EXIT_FAILURE);
3382                         }
3383
3384                         if (chdir("/") < 0) {
3385                                 log_error("chdir() failed: %m");
3386                                 _exit(EXIT_FAILURE);
3387                         }
3388
3389                         umask(0022);
3390
3391                         if (arg_private_network)
3392                                 loopback_setup();
3393
3394                         if (drop_capabilities() < 0) {
3395                                 log_error("drop_capabilities() failed: %m");
3396                                 _exit(EXIT_FAILURE);
3397                         }
3398
3399                         r = change_uid_gid(&home);
3400                         if (r < 0)
3401                                 _exit(EXIT_FAILURE);
3402
3403                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3404                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3405                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3406                                 log_oom();
3407                                 _exit(EXIT_FAILURE);
3408                         }
3409
3410                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3411                                 char as_uuid[37];
3412
3413                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3414                                         log_oom();
3415                                         _exit(EXIT_FAILURE);
3416                                 }
3417                         }
3418
3419                         if (fdset_size(fds) > 0) {
3420                                 k = fdset_cloexec(fds, false);
3421                                 if (k < 0) {
3422                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3423                                         _exit(EXIT_FAILURE);
3424                                 }
3425
3426                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3427                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3428                                         log_oom();
3429                                         _exit(EXIT_FAILURE);
3430                                 }
3431                         }
3432
3433                         setup_hostname();
3434
3435                         if (arg_personality != 0xffffffffLU) {
3436                                 if (personality(arg_personality) < 0) {
3437                                         log_error("personality() failed: %m");
3438                                         _exit(EXIT_FAILURE);
3439                                 }
3440                         } else if (secondary) {
3441                                 if (personality(PER_LINUX32) < 0) {
3442                                         log_error("personality() failed: %m");
3443                                         _exit(EXIT_FAILURE);
3444                                 }
3445                         }
3446
3447 #ifdef HAVE_SELINUX
3448                         if (arg_selinux_context)
3449                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3450                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3451                                         _exit(EXIT_FAILURE);
3452                                 }
3453 #endif
3454
3455                         if (!strv_isempty(arg_setenv)) {
3456                                 char **n;
3457
3458                                 n = strv_env_merge(2, envp, arg_setenv);
3459                                 if (!n) {
3460                                         log_oom();
3461                                         _exit(EXIT_FAILURE);
3462                                 }
3463
3464                                 env_use = n;
3465                         } else
3466                                 env_use = (char**) envp;
3467
3468                         /* Wait until the parent is ready with the setup, too... */
3469                         if (!barrier_place_and_sync(&barrier))
3470                                 _exit(EXIT_FAILURE);
3471
3472                         if (arg_boot) {
3473                                 char **a;
3474                                 size_t l;
3475
3476                                 /* Automatically search for the init system */
3477
3478                                 l = 1 + argc - optind;
3479                                 a = newa(char*, l + 1);
3480                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3481
3482                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3483                                 execve(a[0], a, env_use);
3484
3485                                 a[0] = (char*) "/lib/systemd/systemd";
3486                                 execve(a[0], a, env_use);
3487
3488                                 a[0] = (char*) "/sbin/init";
3489                                 execve(a[0], a, env_use);
3490                         } else if (argc > optind)
3491                                 execvpe(argv[optind], argv + optind, env_use);
3492                         else {
3493                                 chdir(home ? home : "/root");
3494                                 execle("/bin/bash", "-bash", NULL, env_use);
3495                                 execle("/bin/sh", "-sh", NULL, env_use);
3496                         }
3497
3498                         log_error("execv() failed: %m");
3499                         _exit(EXIT_FAILURE);
3500                 }
3501
3502                 barrier_set_role(&barrier, BARRIER_PARENT);
3503                 fdset_free(fds);
3504                 fds = NULL;
3505
3506                 /* wait for child-setup to be done */
3507                 if (barrier_place_and_sync(&barrier)) {
3508                         int ifi = 0;
3509
3510                         r = move_network_interfaces(pid);
3511                         if (r < 0)
3512                                 goto finish;
3513
3514                         r = setup_veth(pid, veth_name, &ifi);
3515                         if (r < 0)
3516                                 goto finish;
3517
3518                         r = setup_bridge(veth_name, &ifi);
3519                         if (r < 0)
3520                                 goto finish;
3521
3522                         r = setup_macvlan(pid);
3523                         if (r < 0)
3524                                 goto finish;
3525
3526                         r = register_machine(pid, ifi);
3527                         if (r < 0)
3528                                 goto finish;
3529
3530                         /* Block SIGCHLD here, before notifying child.
3531                          * process_pty() will handle it with the other signals. */
3532                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3533                         if (r < 0)
3534                                 goto finish;
3535
3536                         /* Reset signal to default */
3537                         r = default_signals(SIGCHLD, -1);
3538                         if (r < 0)
3539                                 goto finish;
3540
3541                         /* Notify the child that the parent is ready with all
3542                          * its setup, and that the child can now hand over
3543                          * control to the code to run inside the container. */
3544                         barrier_place(&barrier);
3545
3546                         k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3547                         if (k < 0) {
3548                                 r = EXIT_FAILURE;
3549                                 break;
3550                         }
3551
3552                         if (!arg_quiet)
3553                                 putc('\n', stdout);
3554
3555                         /* Kill if it is not dead yet anyway */
3556                         terminate_machine(pid);
3557                 }
3558
3559                 /* Normally redundant, but better safe than sorry */
3560                 kill(pid, SIGKILL);
3561
3562                 r = wait_for_container(pid, &container_status);
3563                 pid = 0;
3564
3565                 if (r < 0) {
3566                         /* We failed to wait for the container, or the
3567                          * container exited abnormally */
3568                         r = EXIT_FAILURE;
3569                         break;
3570                 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3571                         /* The container exited with a non-zero
3572                          * status, or with zero status and no reboot
3573                          * was requested. */
3574                         break;
3575
3576                 /* CONTAINER_REBOOTED, loop again */
3577
3578                 if (arg_keep_unit) {
3579                         /* Special handling if we are running as a
3580                          * service: instead of simply restarting the
3581                          * machine we want to restart the entire
3582                          * service, so let's inform systemd about this
3583                          * with the special exit code 133. The service
3584                          * file uses RestartForceExitStatus=133 so
3585                          * that this results in a full nspawn
3586                          * restart. This is necessary since we might
3587                          * have cgroup parameters set we want to have
3588                          * flushed out. */
3589                         r = 133;
3590                         break;
3591                 }
3592         }
3593
3594 finish:
3595         sd_notify(false,
3596                   "STOPPING=1\n"
3597                   "STATUS=Terminating...");
3598
3599         loop_remove(loop_nr, &image_fd);
3600
3601         if (pid > 0)
3602                 kill(pid, SIGKILL);
3603
3604         free(arg_directory);
3605         free(arg_machine);
3606         free(arg_user);
3607         strv_free(arg_setenv);
3608         strv_free(arg_network_interfaces);
3609         strv_free(arg_network_macvlan);
3610         strv_free(arg_bind);
3611         strv_free(arg_bind_ro);
3612         strv_free(arg_tmpfs);
3613
3614         return r;
3615 }