chiark / gitweb /
sd-journal: fix sd_journal_enumerate_unique skipping values
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92
93 #ifdef HAVE_SECCOMP
94 #include "seccomp-util.h"
95 #endif
96
/* Outcome of a container run as distinguished by this tool: the payload
 * either terminated or asked for a reboot. (The consumers of this type
 * live outside the part of the file shown here.) */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
101
/* Journal linking modes, selected with --link-journal=no|auto|host|guest
 * (and with -j, which picks LINK_GUEST). */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
108
/* Volatile modes, selected with --volatile[=MODE]: a boolean argument maps
 * to VOLATILE_YES/VOLATILE_NO, the literal "state" to VOLATILE_STATE. */
typedef enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2,
} Volatile;
114
115 static char *arg_directory = NULL;
116 static char *arg_user = NULL;
117 static sd_id128_t arg_uuid = {};
118 static char *arg_machine = NULL;
119 static const char *arg_selinux_context = NULL;
120 static const char *arg_selinux_apifs_context = NULL;
121 static const char *arg_slice = NULL;
122 static bool arg_private_network = false;
123 static bool arg_read_only = false;
124 static bool arg_boot = false;
125 static LinkJournal arg_link_journal = LINK_AUTO;
126 static uint64_t arg_retain =
127         (1ULL << CAP_CHOWN) |
128         (1ULL << CAP_DAC_OVERRIDE) |
129         (1ULL << CAP_DAC_READ_SEARCH) |
130         (1ULL << CAP_FOWNER) |
131         (1ULL << CAP_FSETID) |
132         (1ULL << CAP_IPC_OWNER) |
133         (1ULL << CAP_KILL) |
134         (1ULL << CAP_LEASE) |
135         (1ULL << CAP_LINUX_IMMUTABLE) |
136         (1ULL << CAP_NET_BIND_SERVICE) |
137         (1ULL << CAP_NET_BROADCAST) |
138         (1ULL << CAP_NET_RAW) |
139         (1ULL << CAP_SETGID) |
140         (1ULL << CAP_SETFCAP) |
141         (1ULL << CAP_SETPCAP) |
142         (1ULL << CAP_SETUID) |
143         (1ULL << CAP_SYS_ADMIN) |
144         (1ULL << CAP_SYS_CHROOT) |
145         (1ULL << CAP_SYS_NICE) |
146         (1ULL << CAP_SYS_PTRACE) |
147         (1ULL << CAP_SYS_TTY_CONFIG) |
148         (1ULL << CAP_SYS_RESOURCE) |
149         (1ULL << CAP_SYS_BOOT) |
150         (1ULL << CAP_AUDIT_WRITE) |
151         (1ULL << CAP_AUDIT_CONTROL) |
152         (1ULL << CAP_MKNOD);
153 static char **arg_bind = NULL;
154 static char **arg_bind_ro = NULL;
155 static char **arg_tmpfs = NULL;
156 static char **arg_setenv = NULL;
157 static bool arg_quiet = false;
158 static bool arg_share_system = false;
159 static bool arg_register = true;
160 static bool arg_keep_unit = false;
161 static char **arg_network_interfaces = NULL;
162 static char **arg_network_macvlan = NULL;
163 static bool arg_network_veth = false;
164 static const char *arg_network_bridge = NULL;
165 static unsigned long arg_personality = 0xffffffffLU;
166 static const char *arg_image = NULL;
167 static Volatile arg_volatile = VOLATILE_NO;
168
169 static void help(void) {
170         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
171                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
172                "  -h --help                 Show this help\n"
173                "     --version              Print version string\n"
174                "  -q --quiet                Do not show status information\n"
175                "  -D --directory=PATH       Root directory for the container\n"
176                "  -i --image=PATH           File system device or image for the container\n"
177                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
178                "  -u --user=USER            Run the command under specified user or uid\n"
179                "  -M --machine=NAME         Set the machine name for the container\n"
180                "     --uuid=UUID            Set a specific machine UUID for the container\n"
181                "  -S --slice=SLICE          Place the container in the specified slice\n"
182                "     --private-network      Disable network in container\n"
183                "     --network-interface=INTERFACE\n"
184                "                            Assign an existing network interface to the\n"
185                "                            container\n"
186                "     --network-macvlan=INTERFACE\n"
187                "                            Create a macvlan network interface based on an\n"
188                "                            existing network interface to the container\n"
189                "     --network-veth         Add a virtual ethernet connection between host\n"
190                "                            and container\n"
191                "     --network-bridge=INTERFACE\n"
192                "                            Add a virtual ethernet connection between host\n"
193                "                            and container and add it to an existing bridge on\n"
194                "                            the host\n"
195                "  -Z --selinux-context=SECLABEL\n"
196                "                            Set the SELinux security context to be used by\n"
197                "                            processes in the container\n"
198                "  -L --selinux-apifs-context=SECLABEL\n"
199                "                            Set the SELinux security context to be used by\n"
200                "                            API/tmpfs file systems in the container\n"
201                "     --capability=CAP       In addition to the default, retain specified\n"
202                "                            capability\n"
203                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
204                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
205                "  -j                        Equivalent to --link-journal=host\n"
206                "     --read-only            Mount the root directory read-only\n"
207                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
208                "                            the container\n"
209                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
210                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
211                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
212                "     --share-system         Share system namespaces with host\n"
213                "     --register=BOOLEAN     Register container as machine\n"
214                "     --keep-unit            Do not register a scope for the machine, reuse\n"
215                "                            the service unit nspawn is running in\n"
216                "     --volatile[=MODE]      Run the system in volatile mode\n",
217                program_invocation_short_name);
218 }
219
220 static int parse_argv(int argc, char *argv[]) {
221
222         enum {
223                 ARG_VERSION = 0x100,
224                 ARG_PRIVATE_NETWORK,
225                 ARG_UUID,
226                 ARG_READ_ONLY,
227                 ARG_CAPABILITY,
228                 ARG_DROP_CAPABILITY,
229                 ARG_LINK_JOURNAL,
230                 ARG_BIND,
231                 ARG_BIND_RO,
232                 ARG_TMPFS,
233                 ARG_SETENV,
234                 ARG_SHARE_SYSTEM,
235                 ARG_REGISTER,
236                 ARG_KEEP_UNIT,
237                 ARG_NETWORK_INTERFACE,
238                 ARG_NETWORK_MACVLAN,
239                 ARG_NETWORK_VETH,
240                 ARG_NETWORK_BRIDGE,
241                 ARG_PERSONALITY,
242                 ARG_VOLATILE,
243         };
244
245         static const struct option options[] = {
246                 { "help",                  no_argument,       NULL, 'h'                   },
247                 { "version",               no_argument,       NULL, ARG_VERSION           },
248                 { "directory",             required_argument, NULL, 'D'                   },
249                 { "user",                  required_argument, NULL, 'u'                   },
250                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
251                 { "boot",                  no_argument,       NULL, 'b'                   },
252                 { "uuid",                  required_argument, NULL, ARG_UUID              },
253                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
254                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
255                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
256                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
257                 { "bind",                  required_argument, NULL, ARG_BIND              },
258                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
259                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
260                 { "machine",               required_argument, NULL, 'M'                   },
261                 { "slice",                 required_argument, NULL, 'S'                   },
262                 { "setenv",                required_argument, NULL, ARG_SETENV            },
263                 { "selinux-context",       required_argument, NULL, 'Z'                   },
264                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
265                 { "quiet",                 no_argument,       NULL, 'q'                   },
266                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
267                 { "register",              required_argument, NULL, ARG_REGISTER          },
268                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
269                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
270                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
271                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
272                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
273                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
274                 { "image",                 required_argument, NULL, 'i'                   },
275                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
276                 {}
277         };
278
279         int c, r;
280         uint64_t plus = 0, minus = 0;
281
282         assert(argc >= 0);
283         assert(argv);
284
285         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
286
287                 switch (c) {
288
289                 case 'h':
290                         help();
291                         return 0;
292
293                 case ARG_VERSION:
294                         puts(PACKAGE_STRING);
295                         puts(SYSTEMD_FEATURES);
296                         return 0;
297
298                 case 'D':
299                         free(arg_directory);
300                         arg_directory = canonicalize_file_name(optarg);
301                         if (!arg_directory) {
302                                 log_error("Invalid root directory: %m");
303                                 return -ENOMEM;
304                         }
305
306                         break;
307
308                 case 'i':
309                         arg_image = optarg;
310                         break;
311
312                 case 'u':
313                         free(arg_user);
314                         arg_user = strdup(optarg);
315                         if (!arg_user)
316                                 return log_oom();
317
318                         break;
319
320                 case ARG_NETWORK_BRIDGE:
321                         arg_network_bridge = optarg;
322
323                         /* fall through */
324
325                 case ARG_NETWORK_VETH:
326                         arg_network_veth = true;
327                         arg_private_network = true;
328                         break;
329
330                 case ARG_NETWORK_INTERFACE:
331                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
332                                 return log_oom();
333
334                         arg_private_network = true;
335                         break;
336
337                 case ARG_NETWORK_MACVLAN:
338                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
339                                 return log_oom();
340
341                         /* fall through */
342
343                 case ARG_PRIVATE_NETWORK:
344                         arg_private_network = true;
345                         break;
346
347                 case 'b':
348                         arg_boot = true;
349                         break;
350
351                 case ARG_UUID:
352                         r = sd_id128_from_string(optarg, &arg_uuid);
353                         if (r < 0) {
354                                 log_error("Invalid UUID: %s", optarg);
355                                 return r;
356                         }
357                         break;
358
359                 case 'S':
360                         arg_slice = optarg;
361                         break;
362
363                 case 'M':
364                         if (isempty(optarg)) {
365                                 free(arg_machine);
366                                 arg_machine = NULL;
367                         } else {
368
369                                 if (!hostname_is_valid(optarg)) {
370                                         log_error("Invalid machine name: %s", optarg);
371                                         return -EINVAL;
372                                 }
373
374                                 free(arg_machine);
375                                 arg_machine = strdup(optarg);
376                                 if (!arg_machine)
377                                         return log_oom();
378
379                                 break;
380                         }
381
382                 case 'Z':
383                         arg_selinux_context = optarg;
384                         break;
385
386                 case 'L':
387                         arg_selinux_apifs_context = optarg;
388                         break;
389
390                 case ARG_READ_ONLY:
391                         arg_read_only = true;
392                         break;
393
394                 case ARG_CAPABILITY:
395                 case ARG_DROP_CAPABILITY: {
396                         const char *state, *word;
397                         size_t length;
398
399                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
400                                 _cleanup_free_ char *t;
401                                 cap_value_t cap;
402
403                                 t = strndup(word, length);
404                                 if (!t)
405                                         return log_oom();
406
407                                 if (streq(t, "all")) {
408                                         if (c == ARG_CAPABILITY)
409                                                 plus = (uint64_t) -1;
410                                         else
411                                                 minus = (uint64_t) -1;
412                                 } else {
413                                         if (cap_from_name(t, &cap) < 0) {
414                                                 log_error("Failed to parse capability %s.", t);
415                                                 return -EINVAL;
416                                         }
417
418                                         if (c == ARG_CAPABILITY)
419                                                 plus |= 1ULL << (uint64_t) cap;
420                                         else
421                                                 minus |= 1ULL << (uint64_t) cap;
422                                 }
423                         }
424
425                         break;
426                 }
427
428                 case 'j':
429                         arg_link_journal = LINK_GUEST;
430                         break;
431
432                 case ARG_LINK_JOURNAL:
433                         if (streq(optarg, "auto"))
434                                 arg_link_journal = LINK_AUTO;
435                         else if (streq(optarg, "no"))
436                                 arg_link_journal = LINK_NO;
437                         else if (streq(optarg, "guest"))
438                                 arg_link_journal = LINK_GUEST;
439                         else if (streq(optarg, "host"))
440                                 arg_link_journal = LINK_HOST;
441                         else {
442                                 log_error("Failed to parse link journal mode %s", optarg);
443                                 return -EINVAL;
444                         }
445
446                         break;
447
448                 case ARG_BIND:
449                 case ARG_BIND_RO: {
450                         _cleanup_free_ char *a = NULL, *b = NULL;
451                         char *e;
452                         char ***x;
453
454                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
455
456                         e = strchr(optarg, ':');
457                         if (e) {
458                                 a = strndup(optarg, e - optarg);
459                                 b = strdup(e + 1);
460                         } else {
461                                 a = strdup(optarg);
462                                 b = strdup(optarg);
463                         }
464
465                         if (!a || !b)
466                                 return log_oom();
467
468                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
469                                 log_error("Invalid bind mount specification: %s", optarg);
470                                 return -EINVAL;
471                         }
472
473                         r = strv_extend(x, a);
474                         if (r < 0)
475                                 return log_oom();
476
477                         r = strv_extend(x, b);
478                         if (r < 0)
479                                 return log_oom();
480
481                         break;
482                 }
483
484                 case ARG_TMPFS: {
485                         _cleanup_free_ char *a = NULL, *b = NULL;
486                         char *e;
487
488                         e = strchr(optarg, ':');
489                         if (e) {
490                                 a = strndup(optarg, e - optarg);
491                                 b = strdup(e + 1);
492                         } else {
493                                 a = strdup(optarg);
494                                 b = strdup("mode=0755");
495                         }
496
497                         if (!a || !b)
498                                 return log_oom();
499
500                         if (!path_is_absolute(a)) {
501                                 log_error("Invalid tmpfs specification: %s", optarg);
502                                 return -EINVAL;
503                         }
504
505                         r = strv_push(&arg_tmpfs, a);
506                         if (r < 0)
507                                 return log_oom();
508
509                         a = NULL;
510
511                         r = strv_push(&arg_tmpfs, b);
512                         if (r < 0)
513                                 return log_oom();
514
515                         b = NULL;
516
517                         break;
518                 }
519
520                 case ARG_SETENV: {
521                         char **n;
522
523                         if (!env_assignment_is_valid(optarg)) {
524                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
525                                 return -EINVAL;
526                         }
527
528                         n = strv_env_set(arg_setenv, optarg);
529                         if (!n)
530                                 return log_oom();
531
532                         strv_free(arg_setenv);
533                         arg_setenv = n;
534                         break;
535                 }
536
537                 case 'q':
538                         arg_quiet = true;
539                         break;
540
541                 case ARG_SHARE_SYSTEM:
542                         arg_share_system = true;
543                         break;
544
545                 case ARG_REGISTER:
546                         r = parse_boolean(optarg);
547                         if (r < 0) {
548                                 log_error("Failed to parse --register= argument: %s", optarg);
549                                 return r;
550                         }
551
552                         arg_register = r;
553                         break;
554
555                 case ARG_KEEP_UNIT:
556                         arg_keep_unit = true;
557                         break;
558
559                 case ARG_PERSONALITY:
560
561                         arg_personality = personality_from_string(optarg);
562                         if (arg_personality == 0xffffffffLU) {
563                                 log_error("Unknown or unsupported personality '%s'.", optarg);
564                                 return -EINVAL;
565                         }
566
567                         break;
568
569                 case ARG_VOLATILE:
570
571                         if (!optarg)
572                                 arg_volatile = VOLATILE_YES;
573                         else {
574                                 r = parse_boolean(optarg);
575                                 if (r < 0) {
576                                         if (streq(optarg, "state"))
577                                                 arg_volatile = VOLATILE_STATE;
578                                         else {
579                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
580                                                 return r;
581                                         }
582                                 } else
583                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
584                         }
585
586                         break;
587
588                 case '?':
589                         return -EINVAL;
590
591                 default:
592                         assert_not_reached("Unhandled option");
593                 }
594
595         if (arg_share_system)
596                 arg_register = false;
597
598         if (arg_boot && arg_share_system) {
599                 log_error("--boot and --share-system may not be combined.");
600                 return -EINVAL;
601         }
602
603         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
604                 log_error("--keep-unit may not be used when invoked from a user session.");
605                 return -EINVAL;
606         }
607
608         if (arg_directory && arg_image) {
609                 log_error("--directory= and --image= may not be combined.");
610                 return -EINVAL;
611         }
612
613         if (arg_volatile != VOLATILE_NO && arg_read_only) {
614                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
615                 return -EINVAL;
616         }
617
618         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
619
620         return 1;
621 }
622
623 static int mount_all(const char *dest) {
624
625         typedef struct MountPoint {
626                 const char *what;
627                 const char *where;
628                 const char *type;
629                 const char *options;
630                 unsigned long flags;
631                 bool fatal;
632         } MountPoint;
633
634         static const MountPoint mount_table[] = {
635                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
636                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
637                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
638                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
639                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
640                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
641                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
642                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
643 #ifdef HAVE_SELINUX
644                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
645                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
646 #endif
647         };
648
649         unsigned k;
650         int r = 0;
651
652         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
653                 _cleanup_free_ char *where = NULL;
654 #ifdef HAVE_SELINUX
655                 _cleanup_free_ char *options = NULL;
656 #endif
657                 const char *o;
658                 int t;
659
660                 where = strjoin(dest, "/", mount_table[k].where, NULL);
661                 if (!where)
662                         return log_oom();
663
664                 t = path_is_mount_point(where, true);
665                 if (t < 0) {
666                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
667
668                         if (r == 0)
669                                 r = t;
670
671                         continue;
672                 }
673
674                 /* Skip this entry if it is not a remount. */
675                 if (mount_table[k].what && t > 0)
676                         continue;
677
678                 t = mkdir_p(where, 0755);
679                 if (t < 0) {
680                         if (mount_table[k].fatal) {
681                                log_error("Failed to create directory %s: %s", where, strerror(-t));
682
683                                 if (r == 0)
684                                         r = t;
685                         } else
686                                log_warning("Failed to create directory %s: %s", where, strerror(-t));
687
688                         continue;
689                 }
690
691 #ifdef HAVE_SELINUX
692                 if (arg_selinux_apifs_context &&
693                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
694                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
695                         if (!options)
696                                 return log_oom();
697
698                         o = options;
699                 } else
700 #endif
701                         o = mount_table[k].options;
702
703
704                 if (mount(mount_table[k].what,
705                           where,
706                           mount_table[k].type,
707                           mount_table[k].flags,
708                           o) < 0) {
709
710                         if (mount_table[k].fatal) {
711                                 log_error("mount(%s) failed: %m", where);
712
713                                 if (r == 0)
714                                         r = -errno;
715                         } else
716                                 log_warning("mount(%s) failed: %m", where);
717                 }
718         }
719
720         return r;
721 }
722
723 static int mount_binds(const char *dest, char **l, bool ro) {
724         char **x, **y;
725
726         STRV_FOREACH_PAIR(x, y, l) {
727                 _cleanup_free_ char *where = NULL;
728                 struct stat source_st, dest_st;
729                 int r;
730
731                 if (stat(*x, &source_st) < 0) {
732                         log_error("Failed to stat %s: %m", *x);
733                         return -errno;
734                 }
735
736                 where = strappend(dest, *y);
737                 if (!where)
738                         return log_oom();
739
740                 r = stat(where, &dest_st);
741                 if (r == 0) {
742                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
743                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
744                                 return -EINVAL;
745                         }
746                 } else if (errno == ENOENT) {
747                         r = mkdir_parents_label(where, 0755);
748                         if (r < 0) {
749                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
750                                 return r;
751                         }
752                 } else {
753                         log_error("Failed to bind mount %s: %m", *x);
754                         return -errno;
755                 }
756
757                 /* Create the mount point, but be conservative -- refuse to create block
758                  * and char devices. */
759                 if (S_ISDIR(source_st.st_mode)) {
760                         r = mkdir_label(where, 0755);
761                         if (r < 0) {
762                                 log_error("Failed to create mount point %s: %s", where, strerror(-r));
763
764                                 return r;
765                         }
766                 } else if (S_ISFIFO(source_st.st_mode)) {
767                         r = mkfifo(where, 0644);
768                         if (r < 0 && errno != EEXIST) {
769                                 log_error("Failed to create mount point %s: %m", where);
770
771                                 return -errno;
772                         }
773                 } else if (S_ISSOCK(source_st.st_mode)) {
774                         r = mknod(where, 0644 | S_IFSOCK, 0);
775                         if (r < 0 && errno != EEXIST) {
776                                 log_error("Failed to create mount point %s: %m", where);
777
778                                 return -errno;
779                         }
780                 } else if (S_ISREG(source_st.st_mode)) {
781                         r = touch(where);
782                         if (r < 0) {
783                                 log_error("Failed to create mount point %s: %s", where, strerror(-r));
784
785                                 return r;
786                         }
787                 } else {
788                         log_error("Refusing to create mountpoint for file: %s", *x);
789                         return -ENOTSUP;
790                 }
791
792                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
793                         log_error("mount(%s) failed: %m", where);
794                         return -errno;
795                 }
796
797                 if (ro) {
798                         r = bind_remount_recursive(where, true);
799                         if (r < 0) {
800                                 log_error("Read-Only bind mount failed: %s", strerror(-r));
801                                 return r;
802                         }
803                 }
804         }
805
806         return 0;
807 }
808
809 static int mount_tmpfs(const char *dest) {
810         char **i, **o;
811
812         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
813                 _cleanup_free_ char *where = NULL;
814                 int r;
815
816                 where = strappend(dest, *i);
817                 if (!where)
818                         return log_oom();
819
820                 r = mkdir_label(where, 0755);
821                 if (r < 0) {
822                         log_error("creating mount point for tmpfs %s failed: %s", where, strerror(-r));
823
824                         return r;
825                 }
826
827                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
828                         log_error("tmpfs mount to %s failed: %m", where);
829                         return -errno;
830                 }
831         }
832
833         return 0;
834 }
835
836 static int setup_timezone(const char *dest) {
837         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
838         char *z, *y;
839         int r;
840
841         assert(dest);
842
843         /* Fix the timezone, if possible */
844         r = readlink_malloc("/etc/localtime", &p);
845         if (r < 0) {
846                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
847                 return 0;
848         }
849
850         z = path_startswith(p, "../usr/share/zoneinfo/");
851         if (!z)
852                 z = path_startswith(p, "/usr/share/zoneinfo/");
853         if (!z) {
854                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
855                 return 0;
856         }
857
858         where = strappend(dest, "/etc/localtime");
859         if (!where)
860                 return log_oom();
861
862         r = readlink_malloc(where, &q);
863         if (r >= 0) {
864                 y = path_startswith(q, "../usr/share/zoneinfo/");
865                 if (!y)
866                         y = path_startswith(q, "/usr/share/zoneinfo/");
867
868                 /* Already pointing to the right place? Then do nothing .. */
869                 if (y && streq(y, z))
870                         return 0;
871         }
872
873         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
874         if (!check)
875                 return log_oom();
876
877         if (access(check, F_OK) < 0) {
878                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
879                 return 0;
880         }
881
882         what = strappend("../usr/share/zoneinfo/", z);
883         if (!what)
884                 return log_oom();
885
886         r = mkdir_parents(where, 0755);
887         if (r < 0) {
888                 log_error("Failed to create directory for timezone info %s in container: %s", where, strerror(-r));
889
890                 return 0;
891         }
892
893         r = unlink(where);
894         if (r < 0 && errno != ENOENT) {
895                 log_error("Failed to remove existing timezone info %s in container: %m", where);
896
897                 return 0;
898         }
899
900         if (symlink(what, where) < 0) {
901                 log_error("Failed to correct timezone of container: %m");
902                 return 0;
903         }
904
905         return 0;
906 }
907
908 static int setup_resolv_conf(const char *dest) {
909         _cleanup_free_ char *where = NULL;
910         int r;
911
912         assert(dest);
913
914         if (arg_private_network)
915                 return 0;
916
917         /* Fix resolv.conf, if possible */
918         where = strappend(dest, "/etc/resolv.conf");
919         if (!where)
920                 return log_oom();
921
922         /* We don't really care for the results of this really. If it
923          * fails, it fails, but meh... */
924         r = mkdir_parents(where, 0755);
925         if (r < 0) {
926                 log_warning("Failed to create parent directory for resolv.conf %s: %s", where, strerror(-r));
927
928                 return 0;
929         }
930
931         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
932         if (r < 0) {
933                 log_warning("Failed to copy /etc/resolv.conf to %s: %s", where, strerror(-r));
934
935                 return 0;
936         }
937
938         return 0;
939 }
940
941 static int setup_volatile_state(const char *directory) {
942         const char *p;
943         int r;
944
945         assert(directory);
946
947         if (arg_volatile != VOLATILE_STATE)
948                 return 0;
949
950         /* --volatile=state means we simply overmount /var
951            with a tmpfs, and the rest read-only. */
952
953         r = bind_remount_recursive(directory, true);
954         if (r < 0) {
955                 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
956                 return r;
957         }
958
959         p = strappenda(directory, "/var");
960         r = mkdir(p, 0755);
961         if (r < 0 && errno != EEXIST) {
962                 log_error("Failed to create %s: %m", directory);
963                 return -errno;
964         }
965
966         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
967                 log_error("Failed to mount tmpfs to /var: %m");
968                 return -errno;
969         }
970
971         return 0;
972 }
973
974 static int setup_volatile(const char *directory) {
975         bool tmpfs_mounted = false, bind_mounted = false;
976         char template[] = "/tmp/nspawn-volatile-XXXXXX";
977         const char *f, *t;
978         int r;
979
980         assert(directory);
981
982         if (arg_volatile != VOLATILE_YES)
983                 return 0;
984
985         /* --volatile=yes means we mount a tmpfs to the root dir, and
986            the original /usr to use inside it, and that read-only. */
987
988         if (!mkdtemp(template)) {
989                 log_error("Failed to create temporary directory: %m");
990                 return -errno;
991         }
992
993         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
994                 log_error("Failed to mount tmpfs for root directory: %m");
995                 r = -errno;
996                 goto fail;
997         }
998
999         tmpfs_mounted = true;
1000
1001         f = strappenda(directory, "/usr");
1002         t = strappenda(template, "/usr");
1003
1004         r = mkdir(t, 0755);
1005         if (r < 0 && errno != EEXIST) {
1006                 log_error("Failed to create %s: %m", t);
1007                 r = -errno;
1008                 goto fail;
1009         }
1010
1011         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1012                 log_error("Failed to create /usr bind mount: %m");
1013                 r = -errno;
1014                 goto fail;
1015         }
1016
1017         bind_mounted = true;
1018
1019         r = bind_remount_recursive(t, true);
1020         if (r < 0) {
1021                 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
1022                 goto fail;
1023         }
1024
1025         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1026                 log_error("Failed to move root mount: %m");
1027                 r = -errno;
1028                 goto fail;
1029         }
1030
1031         rmdir(template);
1032
1033         return 0;
1034
1035 fail:
1036         if (bind_mounted)
1037                 umount(t);
1038         if (tmpfs_mounted)
1039                 umount(template);
1040         rmdir(template);
1041         return r;
1042 }
1043
1044 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1045
1046         snprintf(s, 37,
1047                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1048                  SD_ID128_FORMAT_VAL(id));
1049
1050         return s;
1051 }
1052
1053 static int setup_boot_id(const char *dest) {
1054         _cleanup_free_ char *from = NULL, *to = NULL;
1055         sd_id128_t rnd = {};
1056         char as_uuid[37];
1057         int r;
1058
1059         assert(dest);
1060
1061         if (arg_share_system)
1062                 return 0;
1063
1064         /* Generate a new randomized boot ID, so that each boot-up of
1065          * the container gets a new one */
1066
1067         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1068         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1069         if (!from || !to)
1070                 return log_oom();
1071
1072         r = sd_id128_randomize(&rnd);
1073         if (r < 0) {
1074                 log_error("Failed to generate random boot id: %s", strerror(-r));
1075                 return r;
1076         }
1077
1078         id128_format_as_uuid(rnd, as_uuid);
1079
1080         r = write_string_file(from, as_uuid);
1081         if (r < 0) {
1082                 log_error("Failed to write boot id: %s", strerror(-r));
1083                 return r;
1084         }
1085
1086         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1087                 log_error("Failed to bind mount boot id: %m");
1088                 r = -errno;
1089         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1090                 log_warning("Failed to make boot id read-only: %m");
1091
1092         unlink(from);
1093         return r;
1094 }
1095
1096 static int copy_devnodes(const char *dest) {
1097
1098         static const char devnodes[] =
1099                 "null\0"
1100                 "zero\0"
1101                 "full\0"
1102                 "random\0"
1103                 "urandom\0"
1104                 "tty\0"
1105                 "net/tun\0";
1106
1107         const char *d;
1108         int r = 0;
1109         _cleanup_umask_ mode_t u;
1110
1111         assert(dest);
1112
1113         u = umask(0000);
1114
1115         NULSTR_FOREACH(d, devnodes) {
1116                 _cleanup_free_ char *from = NULL, *to = NULL;
1117                 struct stat st;
1118
1119                 from = strappend("/dev/", d);
1120                 to = strjoin(dest, "/dev/", d, NULL);
1121                 if (!from || !to)
1122                         return log_oom();
1123
1124                 if (stat(from, &st) < 0) {
1125
1126                         if (errno != ENOENT) {
1127                                 log_error("Failed to stat %s: %m", from);
1128                                 return -errno;
1129                         }
1130
1131                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1132
1133                         log_error("%s is not a char or block device, cannot copy", from);
1134                         return -EIO;
1135
1136                 } else {
1137                         r = mkdir_parents(to, 0775);
1138                         if (r < 0) {
1139                                 log_error("Failed to create parent directory of %s: %s", to, strerror(-r));
1140                                 return -r;
1141                         }
1142
1143                         if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1144                                 log_error("mknod(%s) failed: %m", dest);
1145                                 return  -errno;
1146                         }
1147                 }
1148         }
1149
1150         return r;
1151 }
1152
1153 static int setup_ptmx(const char *dest) {
1154         _cleanup_free_ char *p = NULL;
1155
1156         p = strappend(dest, "/dev/ptmx");
1157         if (!p)
1158                 return log_oom();
1159
1160         if (symlink("pts/ptmx", p) < 0) {
1161                 log_error("Failed to create /dev/ptmx symlink: %m");
1162                 return -errno;
1163         }
1164
1165         return 0;
1166 }
1167
1168 static int setup_dev_console(const char *dest, const char *console) {
1169         _cleanup_umask_ mode_t u;
1170         const char *to;
1171         struct stat st;
1172         int r;
1173
1174         assert(dest);
1175         assert(console);
1176
1177         u = umask(0000);
1178
1179         if (stat("/dev/null", &st) < 0) {
1180                 log_error("Failed to stat /dev/null: %m");
1181                 return -errno;
1182         }
1183
1184         r = chmod_and_chown(console, 0600, 0, 0);
1185         if (r < 0) {
1186                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
1187                 return r;
1188         }
1189
1190         /* We need to bind mount the right tty to /dev/console since
1191          * ptys can only exist on pts file systems. To have something
1192          * to bind mount things on we create a device node first, and
1193          * use /dev/null for that since we the cgroups device policy
1194          * allows us to create that freely, while we cannot create
1195          * /dev/console. (Note that the major minor doesn't actually
1196          * matter here, since we mount it over anyway). */
1197
1198         to = strappenda(dest, "/dev/console");
1199         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1200                 log_error("mknod() for /dev/console failed: %m");
1201                 return -errno;
1202         }
1203
1204         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1205                 log_error("Bind mount for /dev/console failed: %m");
1206                 return -errno;
1207         }
1208
1209         return 0;
1210 }
1211
1212 static int setup_kmsg(const char *dest, int kmsg_socket) {
1213         _cleanup_free_ char *from = NULL, *to = NULL;
1214         int r, fd, k;
1215         _cleanup_umask_ mode_t u;
1216         union {
1217                 struct cmsghdr cmsghdr;
1218                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1219         } control = {};
1220         struct msghdr mh = {
1221                 .msg_control = &control,
1222                 .msg_controllen = sizeof(control),
1223         };
1224         struct cmsghdr *cmsg;
1225
1226         assert(dest);
1227         assert(kmsg_socket >= 0);
1228
1229         u = umask(0000);
1230
1231         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1232          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1233          * on the reading side behave very similar to /proc/kmsg,
1234          * their writing side behaves differently from /dev/kmsg in
1235          * that writing blocks when nothing is reading. In order to
1236          * avoid any problems with containers deadlocking due to this
1237          * we simply make /dev/kmsg unavailable to the container. */
1238         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1239             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1240                 return log_oom();
1241
1242         if (mkfifo(from, 0600) < 0) {
1243                 log_error("mkfifo() for /dev/kmsg failed: %m");
1244                 return -errno;
1245         }
1246
1247         r = chmod_and_chown(from, 0600, 0, 0);
1248         if (r < 0) {
1249                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1250                 return r;
1251         }
1252
1253         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1254                 log_error("Bind mount for /proc/kmsg failed: %m");
1255                 return -errno;
1256         }
1257
1258         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1259         if (fd < 0) {
1260                 log_error("Failed to open fifo: %m");
1261                 return -errno;
1262         }
1263
1264         cmsg = CMSG_FIRSTHDR(&mh);
1265         cmsg->cmsg_level = SOL_SOCKET;
1266         cmsg->cmsg_type = SCM_RIGHTS;
1267         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1268         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1269
1270         mh.msg_controllen = cmsg->cmsg_len;
1271
1272         /* Store away the fd in the socket, so that it stays open as
1273          * long as we run the child */
1274         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1275         safe_close(fd);
1276
1277         if (k < 0) {
1278                 log_error("Failed to send FIFO fd: %m");
1279                 return -errno;
1280         }
1281
1282         /* And now make the FIFO unavailable as /dev/kmsg... */
1283         unlink(from);
1284         return 0;
1285 }
1286
1287 static int setup_hostname(void) {
1288
1289         if (arg_share_system)
1290                 return 0;
1291
1292         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1293                 return -errno;
1294
1295         return 0;
1296 }
1297
1298 static int setup_journal(const char *directory) {
1299         sd_id128_t machine_id, this_id;
1300         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1301         char *id;
1302         int r;
1303
1304         p = strappend(directory, "/etc/machine-id");
1305         if (!p)
1306                 return log_oom();
1307
1308         r = read_one_line_file(p, &b);
1309         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1310                 return 0;
1311         else if (r < 0) {
1312                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1313                 return r;
1314         }
1315
1316         id = strstrip(b);
1317         if (isempty(id) && arg_link_journal == LINK_AUTO)
1318                 return 0;
1319
1320         /* Verify validity */
1321         r = sd_id128_from_string(id, &machine_id);
1322         if (r < 0) {
1323                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1324                 return r;
1325         }
1326
1327         r = sd_id128_get_machine(&this_id);
1328         if (r < 0) {
1329                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1330                 return r;
1331         }
1332
1333         if (sd_id128_equal(machine_id, this_id)) {
1334                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1335                          "Host and machine ids are equal (%s): refusing to link journals", id);
1336                 if (arg_link_journal == LINK_AUTO)
1337                         return 0;
1338                 return
1339                         -EEXIST;
1340         }
1341
1342         if (arg_link_journal == LINK_NO)
1343                 return 0;
1344
1345         free(p);
1346         p = strappend("/var/log/journal/", id);
1347         q = strjoin(directory, "/var/log/journal/", id, NULL);
1348         if (!p || !q)
1349                 return log_oom();
1350
1351         if (path_is_mount_point(p, false) > 0) {
1352                 if (arg_link_journal != LINK_AUTO) {
1353                         log_error("%s: already a mount point, refusing to use for journal", p);
1354                         return -EEXIST;
1355                 }
1356
1357                 return 0;
1358         }
1359
1360         if (path_is_mount_point(q, false) > 0) {
1361                 if (arg_link_journal != LINK_AUTO) {
1362                         log_error("%s: already a mount point, refusing to use for journal", q);
1363                         return -EEXIST;
1364                 }
1365
1366                 return 0;
1367         }
1368
1369         r = readlink_and_make_absolute(p, &d);
1370         if (r >= 0) {
1371                 if ((arg_link_journal == LINK_GUEST ||
1372                      arg_link_journal == LINK_AUTO) &&
1373                     path_equal(d, q)) {
1374
1375                         r = mkdir_p(q, 0755);
1376                         if (r < 0)
1377                                 log_warning("Failed to create directory %s: %m", q);
1378                         return 0;
1379                 }
1380
1381                 if (unlink(p) < 0) {
1382                         log_error("Failed to remove symlink %s: %m", p);
1383                         return -errno;
1384                 }
1385         } else if (r == -EINVAL) {
1386
1387                 if (arg_link_journal == LINK_GUEST &&
1388                     rmdir(p) < 0) {
1389
1390                         if (errno == ENOTDIR) {
1391                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1392                                 return r;
1393                         } else {
1394                                 log_error("Failed to remove %s: %m", p);
1395                                 return -errno;
1396                         }
1397                 }
1398         } else if (r != -ENOENT) {
1399                 log_error("readlink(%s) failed: %m", p);
1400                 return r;
1401         }
1402
1403         if (arg_link_journal == LINK_GUEST) {
1404
1405                 if (symlink(q, p) < 0) {
1406                         log_error("Failed to symlink %s to %s: %m", q, p);
1407                         return -errno;
1408                 }
1409
1410                 r = mkdir_p(q, 0755);
1411                 if (r < 0)
1412                         log_warning("Failed to create directory %s: %m", q);
1413                 return 0;
1414         }
1415
1416         if (arg_link_journal == LINK_HOST) {
1417                 r = mkdir_p(p, 0755);
1418                 if (r < 0) {
1419                         log_error("Failed to create %s: %m", p);
1420                         return r;
1421                 }
1422
1423         } else if (access(p, F_OK) < 0)
1424                 return 0;
1425
1426         if (dir_is_empty(q) == 0)
1427                 log_warning("%s is not empty, proceeding anyway.", q);
1428
1429         r = mkdir_p(q, 0755);
1430         if (r < 0) {
1431                 log_error("Failed to create %s: %m", q);
1432                 return r;
1433         }
1434
1435         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1436                 log_error("Failed to bind mount journal from host into guest: %m");
1437                 return -errno;
1438         }
1439
1440         return 0;
1441 }
1442
1443 static int setup_kdbus(const char *dest, const char *path) {
1444         const char *p;
1445
1446         if (!path)
1447                 return 0;
1448
1449         p = strappenda(dest, "/dev/kdbus");
1450         if (mkdir(p, 0755) < 0) {
1451                 log_error("Failed to create kdbus path: %m");
1452                 return  -errno;
1453         }
1454
1455         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1456                 log_error("Failed to mount kdbus domain path: %m");
1457                 return -errno;
1458         }
1459
1460         return 0;
1461 }
1462
1463 static int drop_capabilities(void) {
1464         return capability_bounding_set_drop(~arg_retain, false);
1465 }
1466
1467 static int register_machine(pid_t pid, int local_ifindex) {
1468         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1469         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1470         int r;
1471
1472         if (!arg_register)
1473                 return 0;
1474
1475         r = sd_bus_default_system(&bus);
1476         if (r < 0) {
1477                 log_error("Failed to open system bus: %s", strerror(-r));
1478                 return r;
1479         }
1480
1481         if (arg_keep_unit) {
1482                 r = sd_bus_call_method(
1483                                 bus,
1484                                 "org.freedesktop.machine1",
1485                                 "/org/freedesktop/machine1",
1486                                 "org.freedesktop.machine1.Manager",
1487                                 "RegisterMachineWithNetwork",
1488                                 &error,
1489                                 NULL,
1490                                 "sayssusai",
1491                                 arg_machine,
1492                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1493                                 "nspawn",
1494                                 "container",
1495                                 (uint32_t) pid,
1496                                 strempty(arg_directory),
1497                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1498         } else {
1499                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1500
1501                 r = sd_bus_message_new_method_call(
1502                                 bus,
1503                                 &m,
1504                                 "org.freedesktop.machine1",
1505                                 "/org/freedesktop/machine1",
1506                                 "org.freedesktop.machine1.Manager",
1507                                 "CreateMachineWithNetwork");
1508                 if (r < 0) {
1509                         log_error("Failed to create message: %s", strerror(-r));
1510                         return r;
1511                 }
1512
1513                 r = sd_bus_message_append(
1514                                 m,
1515                                 "sayssusai",
1516                                 arg_machine,
1517                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1518                                 "nspawn",
1519                                 "container",
1520                                 (uint32_t) pid,
1521                                 strempty(arg_directory),
1522                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1523                 if (r < 0) {
1524                         log_error("Failed to append message arguments: %s", strerror(-r));
1525                         return r;
1526                 }
1527
1528                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1529                 if (r < 0) {
1530                         log_error("Failed to open container: %s", strerror(-r));
1531                         return r;
1532                 }
1533
1534                 if (!isempty(arg_slice)) {
1535                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1536                         if (r < 0) {
1537                                 log_error("Failed to append slice: %s", strerror(-r));
1538                                 return r;
1539                         }
1540                 }
1541
1542                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1543                 if (r < 0) {
1544                         log_error("Failed to add device policy: %s", strerror(-r));
1545                         return r;
1546                 }
1547
1548                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1549                                           /* Allow the container to
1550                                            * access and create the API
1551                                            * device nodes, so that
1552                                            * PrivateDevices= in the
1553                                            * container can work
1554                                            * fine */
1555                                           "/dev/null", "rwm",
1556                                           "/dev/zero", "rwm",
1557                                           "/dev/full", "rwm",
1558                                           "/dev/random", "rwm",
1559                                           "/dev/urandom", "rwm",
1560                                           "/dev/tty", "rwm",
1561                                           /* Allow the container
1562                                            * access to ptys. However,
1563                                            * do not permit the
1564                                            * container to ever create
1565                                            * these device nodes. */
1566                                           "/dev/pts/ptmx", "rw",
1567                                           "char-pts", "rw",
1568                                           /* Allow the container
1569                                            * access to all kdbus
1570                                            * devices. Again, the
1571                                            * container cannot create
1572                                            * these nodes, only use
1573                                            * them. We use a pretty
1574                                            * open match here, so that
1575                                            * the kernel API can still
1576                                            * change. */
1577                                           "char-kdbus", "rw",
1578                                           "char-kdbus/*", "rw");
1579                 if (r < 0) {
1580                         log_error("Failed to add device whitelist: %s", strerror(-r));
1581                         return r;
1582                 }
1583
1584                 r = sd_bus_message_close_container(m);
1585                 if (r < 0) {
1586                         log_error("Failed to close container: %s", strerror(-r));
1587                         return r;
1588                 }
1589
1590                 r = sd_bus_call(bus, m, 0, &error, NULL);
1591         }
1592
1593         if (r < 0) {
1594                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1595                 return r;
1596         }
1597
1598         return 0;
1599 }
1600
1601 static int terminate_machine(pid_t pid) {
1602         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1603         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1604         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1605         const char *path;
1606         int r;
1607
1608         if (!arg_register)
1609                 return 0;
1610
1611         r = sd_bus_default_system(&bus);
1612         if (r < 0) {
1613                 log_error("Failed to open system bus: %s", strerror(-r));
1614                 return r;
1615         }
1616
1617         r = sd_bus_call_method(
1618                         bus,
1619                         "org.freedesktop.machine1",
1620                         "/org/freedesktop/machine1",
1621                         "org.freedesktop.machine1.Manager",
1622                         "GetMachineByPID",
1623                         &error,
1624                         &reply,
1625                         "u",
1626                         (uint32_t) pid);
1627         if (r < 0) {
1628                 /* Note that the machine might already have been
1629                  * cleaned up automatically, hence don't consider it a
1630                  * failure if we cannot get the machine object. */
1631                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1632                 return 0;
1633         }
1634
1635         r = sd_bus_message_read(reply, "o", &path);
1636         if (r < 0)
1637                 return bus_log_parse_error(r);
1638
1639         r = sd_bus_call_method(
1640                         bus,
1641                         "org.freedesktop.machine1",
1642                         path,
1643                         "org.freedesktop.machine1.Machine",
1644                         "Terminate",
1645                         &error,
1646                         NULL,
1647                         NULL);
1648         if (r < 0) {
1649                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1650                 return 0;
1651         }
1652
1653         return 0;
1654 }
1655
1656 static int reset_audit_loginuid(void) {
1657         _cleanup_free_ char *p = NULL;
1658         int r;
1659
1660         if (arg_share_system)
1661                 return 0;
1662
1663         r = read_one_line_file("/proc/self/loginuid", &p);
1664         if (r == -ENOENT)
1665                 return 0;
1666         if (r < 0) {
1667                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1668                 return r;
1669         }
1670
1671         /* Already reset? */
1672         if (streq(p, "4294967295"))
1673                 return 0;
1674
1675         r = write_string_file("/proc/self/loginuid", "4294967295");
1676         if (r < 0) {
1677                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1678                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1679                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1680                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1681                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1682
1683                 sleep(5);
1684         }
1685
1686         return 0;
1687 }
1688
/* Fixed keys that setup_veth() hands to generate_mac(), which uses them
 * as the siphash24 key: one for the MAC address of the host side of the
 * veth link, one for the container side. */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1691
1692 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
1693         int r;
1694
1695         uint8_t result[8];
1696         size_t l, sz;
1697         uint8_t *v;
1698
1699         l = strlen(arg_machine);
1700         sz = sizeof(sd_id128_t) + l;
1701         v = alloca(sz);
1702
1703         /* fetch some persistent data unique to the host */
1704         r = sd_id128_get_machine((sd_id128_t*) v);
1705         if (r < 0)
1706                 return r;
1707
1708         /* combine with some data unique (on this host) to this
1709          * container instance */
1710         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1711
1712         /* Let's hash the host machine ID plus the container name. We
1713          * use a fixed, but originally randomly created hash key here. */
1714         siphash24(result, v, sz, hash_key.bytes);
1715
1716         assert_cc(ETH_ALEN <= sizeof(result));
1717         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1718
1719         /* see eth_random_addr in the kernel */
1720         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1721         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1722
1723         return 0;
1724 }
1725
1726 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1727         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1728         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1729         struct ether_addr mac_host, mac_container;
1730         int r, i;
1731
1732         if (!arg_private_network)
1733                 return 0;
1734
1735         if (!arg_network_veth)
1736                 return 0;
1737
1738         /* Use two different interface name prefixes depending whether
1739          * we are in bridge mode or not. */
1740         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1741                  arg_network_bridge ? "vb" : "ve", arg_machine);
1742
1743         r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
1744         if (r < 0) {
1745                 log_error("Failed to generate predictable MAC address for container side");
1746                 return r;
1747         }
1748
1749         r = generate_mac(&mac_host, HOST_HASH_KEY);
1750         if (r < 0) {
1751                 log_error("Failed to generate predictable MAC address for host side");
1752                 return r;
1753         }
1754
1755         r = sd_rtnl_open(&rtnl, 0);
1756         if (r < 0) {
1757                 log_error("Failed to connect to netlink: %s", strerror(-r));
1758                 return r;
1759         }
1760
1761         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1762         if (r < 0) {
1763                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1764                 return r;
1765         }
1766
1767         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1768         if (r < 0) {
1769                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1770                 return r;
1771         }
1772
1773         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1774         if (r < 0) {
1775                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1776                 return r;
1777         }
1778
1779         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1780         if (r < 0) {
1781                 log_error("Failed to open netlink container: %s", strerror(-r));
1782                 return r;
1783         }
1784
1785         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1786         if (r < 0) {
1787                 log_error("Failed to open netlink container: %s", strerror(-r));
1788                 return r;
1789         }
1790
1791         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1792         if (r < 0) {
1793                 log_error("Failed to open netlink container: %s", strerror(-r));
1794                 return r;
1795         }
1796
1797         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1798         if (r < 0) {
1799                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1800                 return r;
1801         }
1802
1803         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1804         if (r < 0) {
1805                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1806                 return r;
1807         }
1808
1809         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1810         if (r < 0) {
1811                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1812                 return r;
1813         }
1814
1815         r = sd_rtnl_message_close_container(m);
1816         if (r < 0) {
1817                 log_error("Failed to close netlink container: %s", strerror(-r));
1818                 return r;
1819         }
1820
1821         r = sd_rtnl_message_close_container(m);
1822         if (r < 0) {
1823                 log_error("Failed to close netlink container: %s", strerror(-r));
1824                 return r;
1825         }
1826
1827         r = sd_rtnl_message_close_container(m);
1828         if (r < 0) {
1829                 log_error("Failed to close netlink container: %s", strerror(-r));
1830                 return r;
1831         }
1832
1833         r = sd_rtnl_call(rtnl, m, 0, NULL);
1834         if (r < 0) {
1835                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1836                 return r;
1837         }
1838
1839         i = (int) if_nametoindex(iface_name);
1840         if (i <= 0) {
1841                 log_error("Failed to resolve interface %s: %m", iface_name);
1842                 return -errno;
1843         }
1844
1845         *ifi = i;
1846
1847         return 0;
1848 }
1849
1850 static int setup_bridge(const char veth_name[], int *ifi) {
1851         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1852         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1853         int r, bridge;
1854
1855         if (!arg_private_network)
1856                 return 0;
1857
1858         if (!arg_network_veth)
1859                 return 0;
1860
1861         if (!arg_network_bridge)
1862                 return 0;
1863
1864         bridge = (int) if_nametoindex(arg_network_bridge);
1865         if (bridge <= 0) {
1866                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1867                 return -errno;
1868         }
1869
1870         *ifi = bridge;
1871
1872         r = sd_rtnl_open(&rtnl, 0);
1873         if (r < 0) {
1874                 log_error("Failed to connect to netlink: %s", strerror(-r));
1875                 return r;
1876         }
1877
1878         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1879         if (r < 0) {
1880                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1881                 return r;
1882         }
1883
1884         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1885         if (r < 0) {
1886                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1887                 return r;
1888         }
1889
1890         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1891         if (r < 0) {
1892                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1893                 return r;
1894         }
1895
1896         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1897         if (r < 0) {
1898                 log_error("Failed to add netlink master field: %s", strerror(-r));
1899                 return r;
1900         }
1901
1902         r = sd_rtnl_call(rtnl, m, 0, NULL);
1903         if (r < 0) {
1904                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1905                 return r;
1906         }
1907
1908         return 0;
1909 }
1910
1911 static int parse_interface(struct udev *udev, const char *name) {
1912         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1913         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1914         int ifi;
1915
1916         ifi = (int) if_nametoindex(name);
1917         if (ifi <= 0) {
1918                 log_error("Failed to resolve interface %s: %m", name);
1919                 return -errno;
1920         }
1921
1922         sprintf(ifi_str, "n%i", ifi);
1923         d = udev_device_new_from_device_id(udev, ifi_str);
1924         if (!d) {
1925                 log_error("Failed to get udev device for interface %s: %m", name);
1926                 return -errno;
1927         }
1928
1929         if (udev_device_get_is_initialized(d) <= 0) {
1930                 log_error("Network interface %s is not initialized yet.", name);
1931                 return -EBUSY;
1932         }
1933
1934         return ifi;
1935 }
1936
1937 static int move_network_interfaces(pid_t pid) {
1938         _cleanup_udev_unref_ struct udev *udev = NULL;
1939         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1940         char **i;
1941         int r;
1942
1943         if (!arg_private_network)
1944                 return 0;
1945
1946         if (strv_isempty(arg_network_interfaces))
1947                 return 0;
1948
1949         r = sd_rtnl_open(&rtnl, 0);
1950         if (r < 0) {
1951                 log_error("Failed to connect to netlink: %s", strerror(-r));
1952                 return r;
1953         }
1954
1955         udev = udev_new();
1956         if (!udev) {
1957                 log_error("Failed to connect to udev.");
1958                 return -ENOMEM;
1959         }
1960
1961         STRV_FOREACH(i, arg_network_interfaces) {
1962                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1963                 int ifi;
1964
1965                 ifi = parse_interface(udev, *i);
1966                 if (ifi < 0)
1967                         return ifi;
1968
1969                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1970                 if (r < 0) {
1971                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1972                         return r;
1973                 }
1974
1975                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1976                 if (r < 0) {
1977                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1978                         return r;
1979                 }
1980
1981                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1982                 if (r < 0) {
1983                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1984                         return r;
1985                 }
1986         }
1987
1988         return 0;
1989 }
1990
1991 static int setup_macvlan(pid_t pid) {
1992         _cleanup_udev_unref_ struct udev *udev = NULL;
1993         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1994         char **i;
1995         int r;
1996
1997         if (!arg_private_network)
1998                 return 0;
1999
2000         if (strv_isempty(arg_network_macvlan))
2001                 return 0;
2002
2003         r = sd_rtnl_open(&rtnl, 0);
2004         if (r < 0) {
2005                 log_error("Failed to connect to netlink: %s", strerror(-r));
2006                 return r;
2007         }
2008
2009         udev = udev_new();
2010         if (!udev) {
2011                 log_error("Failed to connect to udev.");
2012                 return -ENOMEM;
2013         }
2014
2015         STRV_FOREACH(i, arg_network_macvlan) {
2016                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2017                 _cleanup_free_ char *n = NULL;
2018                 int ifi;
2019
2020                 ifi = parse_interface(udev, *i);
2021                 if (ifi < 0)
2022                         return ifi;
2023
2024                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2025                 if (r < 0) {
2026                         log_error("Failed to allocate netlink message: %s", strerror(-r));
2027                         return r;
2028                 }
2029
2030                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2031                 if (r < 0) {
2032                         log_error("Failed to add netlink interface index: %s", strerror(-r));
2033                         return r;
2034                 }
2035
2036                 n = strappend("mv-", *i);
2037                 if (!n)
2038                         return log_oom();
2039
2040                 strshorten(n, IFNAMSIZ-1);
2041
2042                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2043                 if (r < 0) {
2044                         log_error("Failed to add netlink interface name: %s", strerror(-r));
2045                         return r;
2046                 }
2047
2048                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2049                 if (r < 0) {
2050                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
2051                         return r;
2052                 }
2053
2054                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2055                 if (r < 0) {
2056                         log_error("Failed to open netlink container: %s", strerror(-r));
2057                         return r;
2058                 }
2059
2060                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2061                 if (r < 0) {
2062                         log_error("Failed to open netlink container: %s", strerror(-r));
2063                         return r;
2064                 }
2065
2066                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2067                 if (r < 0) {
2068                         log_error("Failed to append macvlan mode: %s", strerror(-r));
2069                         return r;
2070                 }
2071
2072                 r = sd_rtnl_message_close_container(m);
2073                 if (r < 0) {
2074                         log_error("Failed to close netlink container: %s", strerror(-r));
2075                         return r;
2076                 }
2077
2078                 r = sd_rtnl_message_close_container(m);
2079                 if (r < 0) {
2080                         log_error("Failed to close netlink container: %s", strerror(-r));
2081                         return r;
2082                 }
2083
2084                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2085                 if (r < 0) {
2086                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
2087                         return r;
2088                 }
2089         }
2090
2091         return 0;
2092 }
2093
/* Builds and loads a seccomp filter that allows everything by default,
 * makes a fixed list of system calls fail with EPERM, and makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Compiled to a no-op returning 0 when HAVE_SECCOMP is not defined.
 * Otherwise returns the result of seccomp_load() on the success path,
 * or a negative errno-style value from the first step that failed. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int denied[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(denied); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), denied[n], 0);

                /* -EFAULT means the syscall is unknown; that is
                 * tolerated and the entry is simply skipped. */
                if (r < 0 && r != -EFAULT) {
                        log_error("Failed to block syscall: %s", strerror(-r));
                        goto finish;
                }
        }

        /* Audit is broken in containers, and much of the userspace
         * audit hookup fails when run inside one. We do not care and
         * simply turn off creation of audit sockets: the rule below
         * makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace takes as indication that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(ctx,
                             SCMP_ACT_ERRNO(EAFNOSUPPORT),
                             SCMP_SYS(socket),
                             2,
                             SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                             SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* The filter context is released on every path. */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2173
2174 static int setup_image(char **device_path, int *loop_nr) {
2175         struct loop_info64 info = {
2176                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2177         };
2178         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2179         _cleanup_free_ char* loopdev = NULL;
2180         struct stat st;
2181         int r, nr;
2182
2183         assert(device_path);
2184         assert(loop_nr);
2185
2186         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2187         if (fd < 0) {
2188                 log_error("Failed to open %s: %m", arg_image);
2189                 return -errno;
2190         }
2191
2192         if (fstat(fd, &st) < 0) {
2193                 log_error("Failed to stat %s: %m", arg_image);
2194                 return -errno;
2195         }
2196
2197         if (S_ISBLK(st.st_mode)) {
2198                 char *p;
2199
2200                 p = strdup(arg_image);
2201                 if (!p)
2202                         return log_oom();
2203
2204                 *device_path = p;
2205
2206                 *loop_nr = -1;
2207
2208                 r = fd;
2209                 fd = -1;
2210
2211                 return r;
2212         }
2213
2214         if (!S_ISREG(st.st_mode)) {
2215                 log_error("%s is not a regular file or block device: %m", arg_image);
2216                 return -EINVAL;
2217         }
2218
2219         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2220         if (control < 0) {
2221                 log_error("Failed to open /dev/loop-control: %m");
2222                 return -errno;
2223         }
2224
2225         nr = ioctl(control, LOOP_CTL_GET_FREE);
2226         if (nr < 0) {
2227                 log_error("Failed to allocate loop device: %m");
2228                 return -errno;
2229         }
2230
2231         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2232                 return log_oom();
2233
2234         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2235         if (loop < 0) {
2236                 log_error("Failed to open loop device %s: %m", loopdev);
2237                 return -errno;
2238         }
2239
2240         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2241                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2242                 return -errno;
2243         }
2244
2245         if (arg_read_only)
2246                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2247
2248         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2249                 log_error("Failed to set loopback settings on %s: %m", loopdev);
2250                 return -errno;
2251         }
2252
2253         *device_path = loopdev;
2254         loopdev = NULL;
2255
2256         *loop_nr = nr;
2257
2258         r = loop;
2259         loop = -1;
2260
2261         return r;
2262 }
2263
2264 static int dissect_image(
2265                 int fd,
2266                 char **root_device, bool *root_device_rw,
2267                 char **home_device, bool *home_device_rw,
2268                 char **srv_device, bool *srv_device_rw,
2269                 bool *secondary) {
2270
2271 #ifdef HAVE_BLKID
2272         int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
2273         _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2274         _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2275         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2276         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2277         _cleanup_udev_unref_ struct udev *udev = NULL;
2278         struct udev_list_entry *first, *item;
2279         bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2280         const char *pttype = NULL;
2281         blkid_partlist pl;
2282         struct stat st;
2283         int r;
2284
2285         assert(fd >= 0);
2286         assert(root_device);
2287         assert(home_device);
2288         assert(srv_device);
2289         assert(secondary);
2290
2291         b = blkid_new_probe();
2292         if (!b)
2293                 return log_oom();
2294
2295         errno = 0;
2296         r = blkid_probe_set_device(b, fd, 0, 0);
2297         if (r != 0) {
2298                 if (errno == 0)
2299                         return log_oom();
2300
2301                 log_error("Failed to set device on blkid probe: %m");
2302                 return -errno;
2303         }
2304
2305         blkid_probe_enable_partitions(b, 1);
2306         blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2307
2308         errno = 0;
2309         r = blkid_do_safeprobe(b);
2310         if (r == -2 || r == 1) {
2311                 log_error("Failed to identify any partition table on %s.\n"
2312                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2313                 return -EINVAL;
2314         } else if (r != 0) {
2315                 if (errno == 0)
2316                         errno = EIO;
2317                 log_error("Failed to probe: %m");
2318                 return -errno;
2319         }
2320
2321         blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2322         if (!streq_ptr(pttype, "gpt")) {
2323                 log_error("Image %s does not carry a GUID Partition Table.\n"
2324                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2325                 return -EINVAL;
2326         }
2327
2328         errno = 0;
2329         pl = blkid_probe_get_partitions(b);
2330         if (!pl) {
2331                 if (errno == 0)
2332                         return log_oom();
2333
2334                 log_error("Failed to list partitions of %s", arg_image);
2335                 return -errno;
2336         }
2337
2338         udev = udev_new();
2339         if (!udev)
2340                 return log_oom();
2341
2342         if (fstat(fd, &st) < 0) {
2343                 log_error("Failed to stat block device: %m");
2344                 return -errno;
2345         }
2346
2347         d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2348         if (!d)
2349                 return log_oom();
2350
2351         e = udev_enumerate_new(udev);
2352         if (!e)
2353                 return log_oom();
2354
2355         r = udev_enumerate_add_match_parent(e, d);
2356         if (r < 0)
2357                 return log_oom();
2358
2359         r = udev_enumerate_scan_devices(e);
2360         if (r < 0) {
2361                 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
2362                 return r;
2363         }
2364
2365         first = udev_enumerate_get_list_entry(e);
2366         udev_list_entry_foreach(item, first) {
2367                 _cleanup_udev_device_unref_ struct udev_device *q;
2368                 const char *stype, *node;
2369                 unsigned long long flags;
2370                 sd_id128_t type_id;
2371                 blkid_partition pp;
2372                 dev_t qn;
2373                 int nr;
2374
2375                 errno = 0;
2376                 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2377                 if (!q) {
2378                         if (!errno)
2379                                 errno = ENOMEM;
2380
2381                         log_error("Failed to get partition device of %s: %m", arg_image);
2382                         return -errno;
2383                 }
2384
2385                 qn = udev_device_get_devnum(q);
2386                 if (major(qn) == 0)
2387                         continue;
2388
2389                 if (st.st_rdev == qn)
2390                         continue;
2391
2392                 node = udev_device_get_devnode(q);
2393                 if (!node)
2394                         continue;
2395
2396                 pp = blkid_partlist_devno_to_partition(pl, qn);
2397                 if (!pp)
2398                         continue;
2399
2400                 flags = blkid_partition_get_flags(pp);
2401                 if (flags & GPT_FLAG_NO_AUTO)
2402                         continue;
2403
2404                 nr = blkid_partition_get_partno(pp);
2405                 if (nr < 0)
2406                         continue;
2407
2408                 stype = blkid_partition_get_type_string(pp);
2409                 if (!stype)
2410                         continue;
2411
2412                 if (sd_id128_from_string(stype, &type_id) < 0)
2413                         continue;
2414
2415                 if (sd_id128_equal(type_id, GPT_HOME)) {
2416
2417                         if (home && nr >= home_nr)
2418                                 continue;
2419
2420                         home_nr = nr;
2421                         home_rw = !(flags & GPT_FLAG_READ_ONLY);
2422
2423                         free(home);
2424                         home = strdup(node);
2425                         if (!home)
2426                                 return log_oom();
2427                 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2428
2429                         if (srv && nr >= srv_nr)
2430                                 continue;
2431
2432                         srv_nr = nr;
2433                         srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2434
2435                         free(srv);
2436                         srv = strdup(node);
2437                         if (!srv)
2438                                 return log_oom();
2439                 }
2440 #ifdef GPT_ROOT_NATIVE
2441                 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2442
2443                         if (root && nr >= root_nr)
2444                                 continue;
2445
2446                         root_nr = nr;
2447                         root_rw = !(flags & GPT_FLAG_READ_ONLY);
2448
2449                         free(root);
2450                         root = strdup(node);
2451                         if (!root)
2452                                 return log_oom();
2453                 }
2454 #endif
2455 #ifdef GPT_ROOT_SECONDARY
2456                 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2457
2458                         if (secondary_root && nr >= secondary_root_nr)
2459                                 continue;
2460
2461                         secondary_root_nr = nr;
2462                         secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2463
2464
2465                         free(secondary_root);
2466                         secondary_root = strdup(node);
2467                         if (!secondary_root)
2468                                 return log_oom();
2469                 }
2470 #endif
2471         }
2472
2473         if (!root && !secondary_root) {
2474                 log_error("Failed to identify root partition in disk image %s.\n"
2475                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2476                 return -EINVAL;
2477         }
2478
2479         if (root) {
2480                 *root_device = root;
2481                 root = NULL;
2482
2483                 *root_device_rw = root_rw;
2484                 *secondary = false;
2485         } else if (secondary_root) {
2486                 *root_device = secondary_root;
2487                 secondary_root = NULL;
2488
2489                 *root_device_rw = secondary_root_rw;
2490                 *secondary = true;
2491         }
2492
2493         if (home) {
2494                 *home_device = home;
2495                 home = NULL;
2496
2497                 *home_device_rw = home_rw;
2498         }
2499
2500         if (srv) {
2501                 *srv_device = srv;
2502                 srv = NULL;
2503
2504                 *srv_device_rw = srv_rw;
2505         }
2506
2507         return 0;
2508 #else
2509         log_error("--image= is not supported, compiled without blkid support.");
2510         return -ENOTSUP;
2511 #endif
2512 }
2513
/* Probes block device 'what' with libblkid to learn its file system
 * type and mounts it on 'where', with 'directory' appended to the mount
 * point when one is given. The mount always carries MS_NODEV, and is
 * read-only unless 'rw' is true and the global arg_read_only is unset.
 * LUKS volumes are refused.
 *
 * Returns 0 on success and a negative errno-style value on failure.
 * When built without blkid support this always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        unsigned long mount_flags = MS_NODEV;
        const char *fstype, *target;
        int r;

        assert(what);
        assert(where);

        /* A globally read-only container overrides per-device writability */
        if (arg_read_only || !rw)
                mount_flags |= MS_RDONLY;

        target = where;
        if (directory)
                target = strappenda(where, directory);

        /* errno is cleared first so that a failure which leaves it
         * untouched can be told apart from one that sets it */
        errno = 0;
        probe = blkid_new_probe_from_filename(what);
        if (!probe) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        /* Only the superblock TYPE value is of interest here */
        blkid_probe_enable_superblocks(probe, 1);
        blkid_probe_set_superblocks_flags(probe, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        switch (r) {

        case 0:
                break;

        case -1:
        case 1:
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;

        default:
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        r = blkid_probe_lookup_value(probe, "TYPE", &fstype, NULL);
        if (r < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, target, fstype, mount_flags, NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2579
/* Mounts the partitions found in a disk image below 'where': the root
 * device on 'where' itself, the home device on "<where>/home" and the
 * server data device on "<where>/srv". A NULL device is skipped. Each
 * *_rw flag is handed to mount_device() as the requested writability of
 * that mount.
 *
 * Returns 0 on success, or the negative error returned by the first
 * mount_device() call that failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* The mount point is taken from 'where' rather than from the
         * global arg_directory, so the parameter is actually honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2615
2616 static void loop_remove(int nr, int *image_fd) {
2617         _cleanup_close_ int control = -1;
2618         int r;
2619
2620         if (nr < 0)
2621                 return;
2622
2623         if (image_fd && *image_fd >= 0) {
2624                 r = ioctl(*image_fd, LOOP_CLR_FD);
2625                 if (r < 0)
2626                         log_warning("Failed to close loop image: %m");
2627                 *image_fd = safe_close(*image_fd);
2628         }
2629
2630         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2631         if (control < 0) {
2632                 log_warning("Failed to open /dev/loop-control: %m");
2633                 return;
2634         }
2635
2636         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2637         if (r < 0)
2638                 log_warning("Failed to remove loop %d: %m", nr);
2639 }
2640
/* Forks a child that runs "getent <database> <key>" with an empty
 * environment, stdout connected to a pipe and stdin/stderr connected to
 * /dev/null. /usr/bin/getent is tried first, then /bin/getent.
 *
 * On success the PID of the child is stored in *rpid and the read end
 * of the pipe is returned; the caller has to close that descriptor and
 * wait for the child. On failure a negative errno-style value is
 * returned and no descriptors are left open. */
static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
        int pipe_fds[2];
        pid_t pid;

        assert(database);
        assert(key);
        assert(rpid);

        if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
                log_error("Failed to allocate pipe: %m");
                return -errno;
        }

        pid = fork();
        if (pid < 0) {
                /* Remember the fork() error before logging and cleanup
                 * get a chance to overwrite errno */
                int saved_errno = errno;

                log_error("Failed to fork getent child: %m");

                /* No child exists that could use the pipe, so release
                 * both ends instead of leaking them */
                safe_close(pipe_fds[0]);
                safe_close(pipe_fds[1]);

                return -saved_errno;
        } else if (pid == 0) {
                int nullfd;
                char *empty_env = NULL;

                /* Child: route stdout into the write end of the pipe */
                if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                /* Only close the pipe ends if they do not already occupy
                 * one of the standard descriptors */
                if (pipe_fds[0] > 2)
                        safe_close(pipe_fds[0]);
                if (pipe_fds[1] > 2)
                        safe_close(pipe_fds[1]);

                nullfd = open("/dev/null", O_RDWR);
                if (nullfd < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDIN_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDERR_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (nullfd > 2)
                        safe_close(nullfd);

                reset_all_signal_handlers();
                close_all_fds(NULL, 0);

                /* The second execle() is only reached if the first one failed */
                execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
                execle("/bin/getent", "getent", database, key, NULL, &empty_env);
                _exit(EXIT_FAILURE);
        }

        /* Parent: only the read end is needed from here on */
        pipe_fds[1] = safe_close(pipe_fds[1]);

        *rpid = pid;

        return pipe_fds[0];
}
2697
2698 static int change_uid_gid(char **_home) {
2699         char line[LINE_MAX], *x, *u, *g, *h;
2700         const char *word, *state;
2701         _cleanup_free_ uid_t *uids = NULL;
2702         _cleanup_free_ char *home = NULL;
2703         _cleanup_fclose_ FILE *f = NULL;
2704         _cleanup_close_ int fd = -1;
2705         unsigned n_uids = 0;
2706         size_t sz = 0, l;
2707         uid_t uid;
2708         gid_t gid;
2709         pid_t pid;
2710         int r;
2711
2712         assert(_home);
2713
2714         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2715                 /* Reset everything fully to 0, just in case */
2716
2717                 if (setgroups(0, NULL) < 0) {
2718                         log_error("setgroups() failed: %m");
2719                         return -errno;
2720                 }
2721
2722                 if (setresgid(0, 0, 0) < 0) {
2723                         log_error("setregid() failed: %m");
2724                         return -errno;
2725                 }
2726
2727                 if (setresuid(0, 0, 0) < 0) {
2728                         log_error("setreuid() failed: %m");
2729                         return -errno;
2730                 }
2731
2732                 *_home = NULL;
2733                 return 0;
2734         }
2735
2736         /* First, get user credentials */
2737         fd = spawn_getent("passwd", arg_user, &pid);
2738         if (fd < 0)
2739                 return fd;
2740
2741         f = fdopen(fd, "r");
2742         if (!f)
2743                 return log_oom();
2744         fd = -1;
2745
2746         if (!fgets(line, sizeof(line), f)) {
2747
2748                 if (!ferror(f)) {
2749                         log_error("Failed to resolve user %s.", arg_user);
2750                         return -ESRCH;
2751                 }
2752
2753                 log_error("Failed to read from getent: %m");
2754                 return -errno;
2755         }
2756
2757         truncate_nl(line);
2758
2759         wait_for_terminate_and_warn("getent passwd", pid);
2760
2761         x = strchr(line, ':');
2762         if (!x) {
2763                 log_error("/etc/passwd entry has invalid user field.");
2764                 return -EIO;
2765         }
2766
2767         u = strchr(x+1, ':');
2768         if (!u) {
2769                 log_error("/etc/passwd entry has invalid password field.");
2770                 return -EIO;
2771         }
2772
2773         u++;
2774         g = strchr(u, ':');
2775         if (!g) {
2776                 log_error("/etc/passwd entry has invalid UID field.");
2777                 return -EIO;
2778         }
2779
2780         *g = 0;
2781         g++;
2782         x = strchr(g, ':');
2783         if (!x) {
2784                 log_error("/etc/passwd entry has invalid GID field.");
2785                 return -EIO;
2786         }
2787
2788         *x = 0;
2789         h = strchr(x+1, ':');
2790         if (!h) {
2791                 log_error("/etc/passwd entry has invalid GECOS field.");
2792                 return -EIO;
2793         }
2794
2795         h++;
2796         x = strchr(h, ':');
2797         if (!x) {
2798                 log_error("/etc/passwd entry has invalid home directory field.");
2799                 return -EIO;
2800         }
2801
2802         *x = 0;
2803
2804         r = parse_uid(u, &uid);
2805         if (r < 0) {
2806                 log_error("Failed to parse UID of user.");
2807                 return -EIO;
2808         }
2809
2810         r = parse_gid(g, &gid);
2811         if (r < 0) {
2812                 log_error("Failed to parse GID of user.");
2813                 return -EIO;
2814         }
2815
2816         home = strdup(h);
2817         if (!home)
2818                 return log_oom();
2819
2820         /* Second, get group memberships */
2821         fd = spawn_getent("initgroups", arg_user, &pid);
2822         if (fd < 0)
2823                 return fd;
2824
2825         fclose(f);
2826         f = fdopen(fd, "r");
2827         if (!f)
2828                 return log_oom();
2829         fd = -1;
2830
2831         if (!fgets(line, sizeof(line), f)) {
2832                 if (!ferror(f)) {
2833                         log_error("Failed to resolve user %s.", arg_user);
2834                         return -ESRCH;
2835                 }
2836
2837                 log_error("Failed to read from getent: %m");
2838                 return -errno;
2839         }
2840
2841         truncate_nl(line);
2842
2843         wait_for_terminate_and_warn("getent initgroups", pid);
2844
2845         /* Skip over the username and subsequent separator whitespace */
2846         x = line;
2847         x += strcspn(x, WHITESPACE);
2848         x += strspn(x, WHITESPACE);
2849
2850         FOREACH_WORD(word, l, x, state) {
2851                 char c[l+1];
2852
2853                 memcpy(c, word, l);
2854                 c[l] = 0;
2855
2856                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2857                         return log_oom();
2858
2859                 r = parse_uid(c, &uids[n_uids++]);
2860                 if (r < 0) {
2861                         log_error("Failed to parse group data from getent.");
2862                         return -EIO;
2863                 }
2864         }
2865
2866         r = mkdir_parents(home, 0775);
2867         if (r < 0) {
2868                 log_error("Failed to make home root directory: %s", strerror(-r));
2869                 return r;
2870         }
2871
2872         r = mkdir_safe(home, 0755, uid, gid);
2873         if (r < 0 && r != -EEXIST) {
2874                 log_error("Failed to make home directory: %s", strerror(-r));
2875                 return r;
2876         }
2877
2878         fchown(STDIN_FILENO, uid, gid);
2879         fchown(STDOUT_FILENO, uid, gid);
2880         fchown(STDERR_FILENO, uid, gid);
2881
2882         if (setgroups(n_uids, uids) < 0) {
2883                 log_error("Failed to set auxiliary groups: %m");
2884                 return -errno;
2885         }
2886
2887         if (setresgid(gid, gid, gid) < 0) {
2888                 log_error("setregid() failed: %m");
2889                 return -errno;
2890         }
2891
2892         if (setresuid(uid, uid, uid) < 0) {
2893                 log_error("setreuid() failed: %m");
2894                 return -errno;
2895         }
2896
2897         if (_home) {
2898                 *_home = home;
2899                 home = NULL;
2900         }
2901
2902         return 0;
2903 }
2904
2905 /*
2906  * Return values:
2907  * < 0 : wait_for_terminate() failed to get the state of the
2908  *       container, the container was terminated by a signal, or
2909  *       failed for an unknown reason.  No change is made to the
2910  *       container argument.
2911  * > 0 : The program executed in the container terminated with an
2912  *       error.  The exit code of the program executed in the
2913  *       container is returned.  No change is made to the container
2914  *       argument.
2915  *   0 : The container is being rebooted, has been shut down or exited
2916  *       successfully.  The container argument has been set to either
2917  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2918  *
2919  * That is, success is indicated by a return value of zero, and an
2920  * error is indicated by a non-zero value.
2921  */
2922 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2923         int r;
2924         siginfo_t status;
2925
2926         r = wait_for_terminate(pid, &status);
2927         if (r < 0) {
2928                 log_warning("Failed to wait for container: %s", strerror(-r));
2929                 return r;
2930         }
2931
2932         switch (status.si_code) {
2933         case CLD_EXITED:
2934                 r = status.si_status;
2935                 if (r == 0) {
2936                         if (!arg_quiet)
2937                                 log_debug("Container %s exited successfully.",
2938                                           arg_machine);
2939
2940                         *container = CONTAINER_TERMINATED;
2941                 } else {
2942                         log_error("Container %s failed with error code %i.",
2943                                   arg_machine, status.si_status);
2944                 }
2945                 break;
2946
2947         case CLD_KILLED:
2948                 if (status.si_status == SIGINT) {
2949                         if (!arg_quiet)
2950                                 log_info("Container %s has been shut down.",
2951                                          arg_machine);
2952
2953                         *container = CONTAINER_TERMINATED;
2954                         r = 0;
2955                         break;
2956                 } else if (status.si_status == SIGHUP) {
2957                         if (!arg_quiet)
2958                                 log_info("Container %s is being rebooted.",
2959                                          arg_machine);
2960
2961                         *container = CONTAINER_REBOOTED;
2962                         r = 0;
2963                         break;
2964                 }
2965                 /* CLD_KILLED fallthrough */
2966
2967         case CLD_DUMPED:
2968                 log_error("Container %s terminated by signal %s.",
2969                           arg_machine, signal_to_string(status.si_status));
2970                 r = -1;
2971                 break;
2972
2973         default:
2974                 log_error("Container %s failed due to unknown reason.",
2975                           arg_machine);
2976                 r = -1;
2977                 break;
2978         }
2979
2980         return r;
2981 }
2982
/* Signal handler that intentionally does nothing with the signal it is
 * handed. */
static void nop_handler(int sig) {
        (void) sig;
}
2984
2985 int main(int argc, char *argv[]) {
2986
2987         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2988         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2989         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2990         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2991         _cleanup_fdset_free_ FDSet *fds = NULL;
2992         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2993         const char *console = NULL;
2994         char veth_name[IFNAMSIZ];
2995         bool secondary = false;
2996         sigset_t mask, mask_chld;
2997         pid_t pid = 0;
2998
2999         log_parse_environment();
3000         log_open();
3001
3002         k = parse_argv(argc, argv);
3003         if (k < 0)
3004                 goto finish;
3005         else if (k == 0) {
3006                 r = EXIT_SUCCESS;
3007                 goto finish;
3008         }
3009
3010         if (!arg_image) {
3011                 if (arg_directory) {
3012                         char *p;
3013
3014                         p = path_make_absolute_cwd(arg_directory);
3015                         free(arg_directory);
3016                         arg_directory = p;
3017                 } else
3018                         arg_directory = get_current_dir_name();
3019
3020                 if (!arg_directory) {
3021                         log_error("Failed to determine path, please use -D.");
3022                         goto finish;
3023                 }
3024                 path_kill_slashes(arg_directory);
3025         }
3026
3027         if (!arg_machine) {
3028                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
3029                 if (!arg_machine) {
3030                         log_oom();
3031                         goto finish;
3032                 }
3033
3034                 hostname_cleanup(arg_machine, false);
3035                 if (isempty(arg_machine)) {
3036                         log_error("Failed to determine machine name automatically, please use -M.");
3037                         goto finish;
3038                 }
3039         }
3040
3041         if (geteuid() != 0) {
3042                 log_error("Need to be root.");
3043                 goto finish;
3044         }
3045
3046         if (sd_booted() <= 0) {
3047                 log_error("Not running on a systemd system.");
3048                 goto finish;
3049         }
3050
3051         log_close();
3052         n_fd_passed = sd_listen_fds(false);
3053         if (n_fd_passed > 0) {
3054                 k = fdset_new_listen_fds(&fds, false);
3055                 if (k < 0) {
3056                         log_error("Failed to collect file descriptors: %s", strerror(-k));
3057                         goto finish;
3058                 }
3059         }
3060         fdset_close_others(fds);
3061         log_open();
3062
3063         if (arg_directory) {
3064                 if (path_equal(arg_directory, "/")) {
3065                         log_error("Spawning container on root directory not supported.");
3066                         goto finish;
3067                 }
3068
3069                 if (arg_boot) {
3070                         if (path_is_os_tree(arg_directory) <= 0) {
3071                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3072                                 goto finish;
3073                         }
3074                 } else {
3075                         const char *p;
3076
3077                         p = strappenda(arg_directory,
3078                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3079                         if (access(p, F_OK) < 0) {
3080                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3081                                 goto finish;
3082
3083                         }
3084                 }
3085         } else {
3086                 char template[] = "/tmp/nspawn-root-XXXXXX";
3087
3088                 if (!mkdtemp(template)) {
3089                         log_error("Failed to create temporary directory: %m");
3090                         r = -errno;
3091                         goto finish;
3092                 }
3093
3094                 arg_directory = strdup(template);
3095                 if (!arg_directory) {
3096                         r = log_oom();
3097                         goto finish;
3098                 }
3099
3100                 image_fd = setup_image(&device_path, &loop_nr);
3101                 if (image_fd < 0) {
3102                         r = image_fd;
3103                         goto finish;
3104                 }
3105
3106                 r = dissect_image(image_fd,
3107                                   &root_device, &root_device_rw,
3108                                   &home_device, &home_device_rw,
3109                                   &srv_device, &srv_device_rw,
3110                                   &secondary);
3111                 if (r < 0)
3112                         goto finish;
3113         }
3114
3115         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3116         if (master < 0) {
3117                 log_error("Failed to acquire pseudo tty: %m");
3118                 goto finish;
3119         }
3120
3121         console = ptsname(master);
3122         if (!console) {
3123                 log_error("Failed to determine tty name: %m");
3124                 goto finish;
3125         }
3126
3127         if (!arg_quiet)
3128                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3129                          arg_machine, arg_image ? arg_image : arg_directory);
3130
3131         if (unlockpt(master) < 0) {
3132                 log_error("Failed to unlock tty: %m");
3133                 goto finish;
3134         }
3135
3136         if (access("/dev/kdbus/control", F_OK) >= 0) {
3137
3138                 if (arg_share_system) {
3139                         kdbus_domain = strdup("/dev/kdbus");
3140                         if (!kdbus_domain) {
3141                                 log_oom();
3142                                 goto finish;
3143                         }
3144                 } else {
3145                         const char *ns;
3146
3147                         ns = strappenda("machine-", arg_machine);
3148                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
3149                         if (r < 0)
3150                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
3151                         else
3152                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
3153                 }
3154         }
3155
3156         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3157                 log_error("Failed to create kmsg socket pair: %m");
3158                 goto finish;
3159         }
3160
3161         sd_notify(false,
3162                   "READY=1\n"
3163                   "STATUS=Container running.");
3164
3165         assert_se(sigemptyset(&mask) == 0);
3166         assert_se(sigemptyset(&mask_chld) == 0);
3167         sigaddset(&mask_chld, SIGCHLD);
3168         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3169         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3170
3171         for (;;) {
3172                 ContainerStatus container_status;
3173                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3174                 struct sigaction sa = {
3175                         .sa_handler = nop_handler,
3176                         .sa_flags = SA_NOCLDSTOP,
3177                 };
3178
3179                 r = barrier_create(&barrier);
3180                 if (r < 0) {
3181                         log_error("Cannot initialize IPC barrier: %s", strerror(-r));
3182                         goto finish;
3183                 }
3184
3185                 /* Child can be killed before execv(), so handle SIGCHLD
3186                  * in order to interrupt parent's blocking calls and
3187                  * give it a chance to call wait() and terminate. */
3188                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3189                 if (r < 0) {
3190                         log_error("Failed to change the signal mask: %m");
3191                         goto finish;
3192                 }
3193
3194                 r = sigaction(SIGCHLD, &sa, NULL);
3195                 if (r < 0) {
3196                         log_error("Failed to install SIGCHLD handler: %m");
3197                         goto finish;
3198                 }
3199
3200                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3201                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3202                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
3203                 if (pid < 0) {
3204                         if (errno == EINVAL)
3205                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3206                         else
3207                                 log_error("clone() failed: %m");
3208
3209                         r = pid;
3210                         goto finish;
3211                 }
3212
3213                 if (pid == 0) {
3214                         /* child */
3215                         _cleanup_free_ char *home = NULL;
3216                         unsigned n_env = 2;
3217                         const char *envp[] = {
3218                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3219                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3220                                 NULL, /* TERM */
3221                                 NULL, /* HOME */
3222                                 NULL, /* USER */
3223                                 NULL, /* LOGNAME */
3224                                 NULL, /* container_uuid */
3225                                 NULL, /* LISTEN_FDS */
3226                                 NULL, /* LISTEN_PID */
3227                                 NULL
3228                         };
3229                         char **env_use;
3230
3231                         barrier_set_role(&barrier, BARRIER_CHILD);
3232
3233                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3234                         if (envp[n_env])
3235                                 n_env ++;
3236
3237                         master = safe_close(master);
3238
3239                         close_nointr(STDIN_FILENO);
3240                         close_nointr(STDOUT_FILENO);
3241                         close_nointr(STDERR_FILENO);
3242
3243                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3244
3245                         reset_all_signal_handlers();
3246                         reset_signal_mask();
3247
3248                         k = open_terminal(console, O_RDWR);
3249                         if (k != STDIN_FILENO) {
3250                                 if (k >= 0) {
3251                                         safe_close(k);
3252                                         k = -EINVAL;
3253                                 }
3254
3255                                 log_error("Failed to open console: %s", strerror(-k));
3256                                 _exit(EXIT_FAILURE);
3257                         }
3258
3259                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3260                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3261                                 log_error("Failed to duplicate console: %m");
3262                                 _exit(EXIT_FAILURE);
3263                         }
3264
3265                         if (setsid() < 0) {
3266                                 log_error("setsid() failed: %m");
3267                                 _exit(EXIT_FAILURE);
3268                         }
3269
3270                         if (reset_audit_loginuid() < 0)
3271                                 _exit(EXIT_FAILURE);
3272
3273                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3274                                 log_error("PR_SET_PDEATHSIG failed: %m");
3275                                 _exit(EXIT_FAILURE);
3276                         }
3277
3278                         /* Mark everything as slave, so that we still
3279                          * receive mounts from the real root, but don't
3280                          * propagate mounts to the real root. */
3281                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3282                                 log_error("MS_SLAVE|MS_REC failed: %m");
3283                                 _exit(EXIT_FAILURE);
3284                         }
3285
3286                         if (mount_devices(arg_directory,
3287                                           root_device, root_device_rw,
3288                                           home_device, home_device_rw,
3289                                           srv_device, srv_device_rw) < 0)
3290                                 _exit(EXIT_FAILURE);
3291
3292                         /* Turn directory into bind mount */
3293                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3294                                 log_error("Failed to make bind mount: %m");
3295                                 _exit(EXIT_FAILURE);
3296                         }
3297
3298                         r = setup_volatile(arg_directory);
3299                         if (r < 0)
3300                                 _exit(EXIT_FAILURE);
3301
3302                         if (setup_volatile_state(arg_directory) < 0)
3303                                 _exit(EXIT_FAILURE);
3304
3305                         r = base_filesystem_create(arg_directory);
3306                         if (r < 0)
3307                                 _exit(EXIT_FAILURE);
3308
3309                         if (arg_read_only) {
3310                                 k = bind_remount_recursive(arg_directory, true);
3311                                 if (k < 0) {
3312                                         log_error("Failed to make tree read-only: %s", strerror(-k));
3313                                         _exit(EXIT_FAILURE);
3314                                 }
3315                         }
3316
3317                         if (mount_all(arg_directory) < 0)
3318                                 _exit(EXIT_FAILURE);
3319
3320                         if (copy_devnodes(arg_directory) < 0)
3321                                 _exit(EXIT_FAILURE);
3322
3323                         if (setup_ptmx(arg_directory) < 0)
3324                                 _exit(EXIT_FAILURE);
3325
3326                         dev_setup(arg_directory);
3327
3328                         if (setup_seccomp() < 0)
3329                                 _exit(EXIT_FAILURE);
3330
3331                         if (setup_dev_console(arg_directory, console) < 0)
3332                                 _exit(EXIT_FAILURE);
3333
3334                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3335                                 _exit(EXIT_FAILURE);
3336
3337                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3338
3339                         if (setup_boot_id(arg_directory) < 0)
3340                                 _exit(EXIT_FAILURE);
3341
3342                         if (setup_timezone(arg_directory) < 0)
3343                                 _exit(EXIT_FAILURE);
3344
3345                         if (setup_resolv_conf(arg_directory) < 0)
3346                                 _exit(EXIT_FAILURE);
3347
3348                         if (setup_journal(arg_directory) < 0)
3349                                 _exit(EXIT_FAILURE);
3350
3351                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3352                                 _exit(EXIT_FAILURE);
3353
3354                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3355                                 _exit(EXIT_FAILURE);
3356
3357                         if (mount_tmpfs(arg_directory) < 0)
3358                                 _exit(EXIT_FAILURE);
3359
3360                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3361                                 _exit(EXIT_FAILURE);
3362
3363                         /* Tell the parent that we are ready, and that
3364                          * it can cgroupify us so that we lack access
3365                          * to certain devices and resources. */
3366                         barrier_place(&barrier);
3367
3368                         if (chdir(arg_directory) < 0) {
3369                                 log_error("chdir(%s) failed: %m", arg_directory);
3370                                 _exit(EXIT_FAILURE);
3371                         }
3372
3373                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3374                                 log_error("mount(MS_MOVE) failed: %m");
3375                                 _exit(EXIT_FAILURE);
3376                         }
3377
3378                         if (chroot(".") < 0) {
3379                                 log_error("chroot() failed: %m");
3380                                 _exit(EXIT_FAILURE);
3381                         }
3382
3383                         if (chdir("/") < 0) {
3384                                 log_error("chdir() failed: %m");
3385                                 _exit(EXIT_FAILURE);
3386                         }
3387
3388                         umask(0022);
3389
3390                         if (arg_private_network)
3391                                 loopback_setup();
3392
3393                         if (drop_capabilities() < 0) {
3394                                 log_error("drop_capabilities() failed: %m");
3395                                 _exit(EXIT_FAILURE);
3396                         }
3397
3398                         r = change_uid_gid(&home);
3399                         if (r < 0)
3400                                 _exit(EXIT_FAILURE);
3401
3402                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3403                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3404                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3405                                 log_oom();
3406                                 _exit(EXIT_FAILURE);
3407                         }
3408
3409                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3410                                 char as_uuid[37];
3411
3412                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3413                                         log_oom();
3414                                         _exit(EXIT_FAILURE);
3415                                 }
3416                         }
3417
3418                         if (fdset_size(fds) > 0) {
3419                                 k = fdset_cloexec(fds, false);
3420                                 if (k < 0) {
3421                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3422                                         _exit(EXIT_FAILURE);
3423                                 }
3424
3425                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3426                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3427                                         log_oom();
3428                                         _exit(EXIT_FAILURE);
3429                                 }
3430                         }
3431
3432                         setup_hostname();
3433
3434                         if (arg_personality != 0xffffffffLU) {
3435                                 if (personality(arg_personality) < 0) {
3436                                         log_error("personality() failed: %m");
3437                                         _exit(EXIT_FAILURE);
3438                                 }
3439                         } else if (secondary) {
3440                                 if (personality(PER_LINUX32) < 0) {
3441                                         log_error("personality() failed: %m");
3442                                         _exit(EXIT_FAILURE);
3443                                 }
3444                         }
3445
3446 #ifdef HAVE_SELINUX
3447                         if (arg_selinux_context)
3448                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3449                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3450                                         _exit(EXIT_FAILURE);
3451                                 }
3452 #endif
3453
3454                         if (!strv_isempty(arg_setenv)) {
3455                                 char **n;
3456
3457                                 n = strv_env_merge(2, envp, arg_setenv);
3458                                 if (!n) {
3459                                         log_oom();
3460                                         _exit(EXIT_FAILURE);
3461                                 }
3462
3463                                 env_use = n;
3464                         } else
3465                                 env_use = (char**) envp;
3466
3467                         /* Wait until the parent is ready with the setup, too... */
3468                         if (!barrier_place_and_sync(&barrier))
3469                                 _exit(EXIT_FAILURE);
3470
3471                         if (arg_boot) {
3472                                 char **a;
3473                                 size_t l;
3474
3475                                 /* Automatically search for the init system */
3476
3477                                 l = 1 + argc - optind;
3478                                 a = newa(char*, l + 1);
3479                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3480
3481                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3482                                 execve(a[0], a, env_use);
3483
3484                                 a[0] = (char*) "/lib/systemd/systemd";
3485                                 execve(a[0], a, env_use);
3486
3487                                 a[0] = (char*) "/sbin/init";
3488                                 execve(a[0], a, env_use);
3489                         } else if (argc > optind)
3490                                 execvpe(argv[optind], argv + optind, env_use);
3491                         else {
3492                                 chdir(home ? home : "/root");
3493                                 execle("/bin/bash", "-bash", NULL, env_use);
3494                                 execle("/bin/sh", "-sh", NULL, env_use);
3495                         }
3496
3497                         log_error("execv() failed: %m");
3498                         _exit(EXIT_FAILURE);
3499                 }
3500
3501                 barrier_set_role(&barrier, BARRIER_PARENT);
3502                 fdset_free(fds);
3503                 fds = NULL;
3504
3505                 /* wait for child-setup to be done */
3506                 if (barrier_place_and_sync(&barrier)) {
3507                         int ifi = 0;
3508
3509                         r = move_network_interfaces(pid);
3510                         if (r < 0)
3511                                 goto finish;
3512
3513                         r = setup_veth(pid, veth_name, &ifi);
3514                         if (r < 0)
3515                                 goto finish;
3516
3517                         r = setup_bridge(veth_name, &ifi);
3518                         if (r < 0)
3519                                 goto finish;
3520
3521                         r = setup_macvlan(pid);
3522                         if (r < 0)
3523                                 goto finish;
3524
3525                         r = register_machine(pid, ifi);
3526                         if (r < 0)
3527                                 goto finish;
3528
3529                         /* Block SIGCHLD here, before notifying child.
3530                          * process_pty() will handle it with the other signals. */
3531                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3532                         if (r < 0)
3533                                 goto finish;
3534
3535                         /* Reset signal to default */
3536                         r = default_signals(SIGCHLD, -1);
3537                         if (r < 0)
3538                                 goto finish;
3539
3540                         /* Notify the child that the parent is ready with all
3541                          * its setup, and that the child can now hand over
3542                          * control to the code to run inside the container. */
3543                         barrier_place(&barrier);
3544
3545                         k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3546                         if (k < 0) {
3547                                 r = EXIT_FAILURE;
3548                                 break;
3549                         }
3550
3551                         if (!arg_quiet)
3552                                 putc('\n', stdout);
3553
3554                         /* Kill if it is not dead yet anyway */
3555                         terminate_machine(pid);
3556                 }
3557
3558                 /* Normally redundant, but better safe than sorry */
3559                 kill(pid, SIGKILL);
3560
3561                 r = wait_for_container(pid, &container_status);
3562                 pid = 0;
3563
3564                 if (r < 0) {
3565                         /* We failed to wait for the container, or the
3566                          * container exited abnormally */
3567                         r = EXIT_FAILURE;
3568                         break;
3569                 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3570                         /* The container exited with a non-zero
3571                          * status, or with zero status and no reboot
3572                          * was requested. */
3573                         break;
3574
3575                 /* CONTAINER_REBOOTED, loop again */
3576
3577                 if (arg_keep_unit) {
3578                         /* Special handling if we are running as a
3579                          * service: instead of simply restarting the
3580                          * machine we want to restart the entire
3581                          * service, so let's inform systemd about this
3582                          * with the special exit code 133. The service
3583                          * file uses RestartForceExitStatus=133 so
3584                          * that this results in a full nspawn
3585                          * restart. This is necessary since we might
3586                          * have cgroup parameters set we want to have
3587                          * flushed out. */
3588                         r = 133;
3589                         break;
3590                 }
3591         }
3592
3593 finish:
3594         sd_notify(false,
3595                   "STOPPING=1\n"
3596                   "STATUS=Terminating...");
3597
3598         loop_remove(loop_nr, &image_fd);
3599
3600         if (pid > 0)
3601                 kill(pid, SIGKILL);
3602
3603         free(arg_directory);
3604         free(arg_machine);
3605         free(arg_user);
3606         strv_free(arg_setenv);
3607         strv_free(arg_network_interfaces);
3608         strv_free(arg_network_macvlan);
3609         strv_free(arg_bind);
3610         strv_free(arg_bind_ro);
3611         strv_free(arg_tmpfs);
3612
3613         return r;
3614 }