chiark / gitweb /
e1e1c367f0250437f3ce621e2d9bf5b9aabccc22
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92 #include "event-util.h"
93 #include "cap-list.h"
94
95 #ifdef HAVE_SECCOMP
96 #include "seccomp-util.h"
97 #endif
98
/* How the container payload ended. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
103
/* Journal linking modes selectable with --link-journal= (and -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
110
/* Modes accepted by --volatile[=MODE]: a boolean, or "state". */
typedef enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2,
} Volatile;
116
117 static char *arg_directory = NULL;
118 static char *arg_user = NULL;
119 static sd_id128_t arg_uuid = {};
120 static char *arg_machine = NULL;
121 static const char *arg_selinux_context = NULL;
122 static const char *arg_selinux_apifs_context = NULL;
123 static const char *arg_slice = NULL;
124 static bool arg_private_network = false;
125 static bool arg_read_only = false;
126 static bool arg_boot = false;
127 static LinkJournal arg_link_journal = LINK_AUTO;
128 static bool arg_link_journal_try = false;
129 static uint64_t arg_retain =
130         (1ULL << CAP_CHOWN) |
131         (1ULL << CAP_DAC_OVERRIDE) |
132         (1ULL << CAP_DAC_READ_SEARCH) |
133         (1ULL << CAP_FOWNER) |
134         (1ULL << CAP_FSETID) |
135         (1ULL << CAP_IPC_OWNER) |
136         (1ULL << CAP_KILL) |
137         (1ULL << CAP_LEASE) |
138         (1ULL << CAP_LINUX_IMMUTABLE) |
139         (1ULL << CAP_NET_BIND_SERVICE) |
140         (1ULL << CAP_NET_BROADCAST) |
141         (1ULL << CAP_NET_RAW) |
142         (1ULL << CAP_SETGID) |
143         (1ULL << CAP_SETFCAP) |
144         (1ULL << CAP_SETPCAP) |
145         (1ULL << CAP_SETUID) |
146         (1ULL << CAP_SYS_ADMIN) |
147         (1ULL << CAP_SYS_CHROOT) |
148         (1ULL << CAP_SYS_NICE) |
149         (1ULL << CAP_SYS_PTRACE) |
150         (1ULL << CAP_SYS_TTY_CONFIG) |
151         (1ULL << CAP_SYS_RESOURCE) |
152         (1ULL << CAP_SYS_BOOT) |
153         (1ULL << CAP_AUDIT_WRITE) |
154         (1ULL << CAP_AUDIT_CONTROL) |
155         (1ULL << CAP_MKNOD);
156 static char **arg_bind = NULL;
157 static char **arg_bind_ro = NULL;
158 static char **arg_tmpfs = NULL;
159 static char **arg_setenv = NULL;
160 static bool arg_quiet = false;
161 static bool arg_share_system = false;
162 static bool arg_register = true;
163 static bool arg_keep_unit = false;
164 static char **arg_network_interfaces = NULL;
165 static char **arg_network_macvlan = NULL;
166 static bool arg_network_veth = false;
167 static const char *arg_network_bridge = NULL;
168 static unsigned long arg_personality = 0xffffffffLU;
169 static const char *arg_image = NULL;
170 static Volatile arg_volatile = VOLATILE_NO;
171
172 static void help(void) {
173         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
174                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
175                "  -h --help                 Show this help\n"
176                "     --version              Print version string\n"
177                "  -q --quiet                Do not show status information\n"
178                "  -D --directory=PATH       Root directory for the container\n"
179                "  -i --image=PATH           File system device or image for the container\n"
180                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
181                "  -u --user=USER            Run the command under specified user or uid\n"
182                "  -M --machine=NAME         Set the machine name for the container\n"
183                "     --uuid=UUID            Set a specific machine UUID for the container\n"
184                "  -S --slice=SLICE          Place the container in the specified slice\n"
185                "     --private-network      Disable network in container\n"
186                "     --network-interface=INTERFACE\n"
187                "                            Assign an existing network interface to the\n"
188                "                            container\n"
189                "     --network-macvlan=INTERFACE\n"
190                "                            Create a macvlan network interface based on an\n"
191                "                            existing network interface to the container\n"
192                "     --network-veth         Add a virtual ethernet connection between host\n"
193                "                            and container\n"
194                "     --network-bridge=INTERFACE\n"
195                "                            Add a virtual ethernet connection between host\n"
196                "                            and container and add it to an existing bridge on\n"
197                "                            the host\n"
198                "  -Z --selinux-context=SECLABEL\n"
199                "                            Set the SELinux security context to be used by\n"
200                "                            processes in the container\n"
201                "  -L --selinux-apifs-context=SECLABEL\n"
202                "                            Set the SELinux security context to be used by\n"
203                "                            API/tmpfs file systems in the container\n"
204                "     --capability=CAP       In addition to the default, retain specified\n"
205                "                            capability\n"
206                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
207                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
208                "                            try-guest, try-host\n"
209                "  -j                        Equivalent to --link-journal=try-guest\n"
210                "     --read-only            Mount the root directory read-only\n"
211                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
212                "                            the container\n"
213                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
214                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
215                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
216                "     --share-system         Share system namespaces with host\n"
217                "     --register=BOOLEAN     Register container as machine\n"
218                "     --keep-unit            Do not register a scope for the machine, reuse\n"
219                "                            the service unit nspawn is running in\n"
220                "     --volatile[=MODE]      Run the system in volatile mode\n",
221                program_invocation_short_name);
222 }
223
224 static int parse_argv(int argc, char *argv[]) {
225
226         enum {
227                 ARG_VERSION = 0x100,
228                 ARG_PRIVATE_NETWORK,
229                 ARG_UUID,
230                 ARG_READ_ONLY,
231                 ARG_CAPABILITY,
232                 ARG_DROP_CAPABILITY,
233                 ARG_LINK_JOURNAL,
234                 ARG_BIND,
235                 ARG_BIND_RO,
236                 ARG_TMPFS,
237                 ARG_SETENV,
238                 ARG_SHARE_SYSTEM,
239                 ARG_REGISTER,
240                 ARG_KEEP_UNIT,
241                 ARG_NETWORK_INTERFACE,
242                 ARG_NETWORK_MACVLAN,
243                 ARG_NETWORK_VETH,
244                 ARG_NETWORK_BRIDGE,
245                 ARG_PERSONALITY,
246                 ARG_VOLATILE,
247         };
248
249         static const struct option options[] = {
250                 { "help",                  no_argument,       NULL, 'h'                   },
251                 { "version",               no_argument,       NULL, ARG_VERSION           },
252                 { "directory",             required_argument, NULL, 'D'                   },
253                 { "user",                  required_argument, NULL, 'u'                   },
254                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
255                 { "boot",                  no_argument,       NULL, 'b'                   },
256                 { "uuid",                  required_argument, NULL, ARG_UUID              },
257                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
258                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
259                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
260                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
261                 { "bind",                  required_argument, NULL, ARG_BIND              },
262                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
263                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
264                 { "machine",               required_argument, NULL, 'M'                   },
265                 { "slice",                 required_argument, NULL, 'S'                   },
266                 { "setenv",                required_argument, NULL, ARG_SETENV            },
267                 { "selinux-context",       required_argument, NULL, 'Z'                   },
268                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
269                 { "quiet",                 no_argument,       NULL, 'q'                   },
270                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
271                 { "register",              required_argument, NULL, ARG_REGISTER          },
272                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
273                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
274                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
275                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
276                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
277                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
278                 { "image",                 required_argument, NULL, 'i'                   },
279                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
280                 {}
281         };
282
283         int c, r;
284         uint64_t plus = 0, minus = 0;
285
286         assert(argc >= 0);
287         assert(argv);
288
289         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
290
291                 switch (c) {
292
293                 case 'h':
294                         help();
295                         return 0;
296
297                 case ARG_VERSION:
298                         puts(PACKAGE_STRING);
299                         puts(SYSTEMD_FEATURES);
300                         return 0;
301
302                 case 'D':
303                         free(arg_directory);
304                         arg_directory = canonicalize_file_name(optarg);
305                         if (!arg_directory) {
306                                 log_error_errno(errno, "Invalid root directory: %m");
307                                 return -ENOMEM;
308                         }
309
310                         break;
311
312                 case 'i':
313                         arg_image = optarg;
314                         break;
315
316                 case 'u':
317                         free(arg_user);
318                         arg_user = strdup(optarg);
319                         if (!arg_user)
320                                 return log_oom();
321
322                         break;
323
324                 case ARG_NETWORK_BRIDGE:
325                         arg_network_bridge = optarg;
326
327                         /* fall through */
328
329                 case ARG_NETWORK_VETH:
330                         arg_network_veth = true;
331                         arg_private_network = true;
332                         break;
333
334                 case ARG_NETWORK_INTERFACE:
335                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
336                                 return log_oom();
337
338                         arg_private_network = true;
339                         break;
340
341                 case ARG_NETWORK_MACVLAN:
342                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
343                                 return log_oom();
344
345                         /* fall through */
346
347                 case ARG_PRIVATE_NETWORK:
348                         arg_private_network = true;
349                         break;
350
351                 case 'b':
352                         arg_boot = true;
353                         break;
354
355                 case ARG_UUID:
356                         r = sd_id128_from_string(optarg, &arg_uuid);
357                         if (r < 0) {
358                                 log_error("Invalid UUID: %s", optarg);
359                                 return r;
360                         }
361                         break;
362
363                 case 'S':
364                         arg_slice = optarg;
365                         break;
366
367                 case 'M':
368                         if (isempty(optarg)) {
369                                 free(arg_machine);
370                                 arg_machine = NULL;
371                         } else {
372                                 if (!machine_name_is_valid(optarg)) {
373                                         log_error("Invalid machine name: %s", optarg);
374                                         return -EINVAL;
375                                 }
376
377                                 r = free_and_strdup(&arg_machine, optarg);
378                                 if (r < 0)
379                                         return log_oom();
380
381                                 break;
382                         }
383
384                 case 'Z':
385                         arg_selinux_context = optarg;
386                         break;
387
388                 case 'L':
389                         arg_selinux_apifs_context = optarg;
390                         break;
391
392                 case ARG_READ_ONLY:
393                         arg_read_only = true;
394                         break;
395
396                 case ARG_CAPABILITY:
397                 case ARG_DROP_CAPABILITY: {
398                         const char *state, *word;
399                         size_t length;
400
401                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
402                                 _cleanup_free_ char *t;
403
404                                 t = strndup(word, length);
405                                 if (!t)
406                                         return log_oom();
407
408                                 if (streq(t, "all")) {
409                                         if (c == ARG_CAPABILITY)
410                                                 plus = (uint64_t) -1;
411                                         else
412                                                 minus = (uint64_t) -1;
413                                 } else {
414                                         int cap;
415
416                                         cap = capability_from_name(t);
417                                         if (cap < 0) {
418                                                 log_error("Failed to parse capability %s.", t);
419                                                 return -EINVAL;
420                                         }
421
422                                         if (c == ARG_CAPABILITY)
423                                                 plus |= 1ULL << (uint64_t) cap;
424                                         else
425                                                 minus |= 1ULL << (uint64_t) cap;
426                                 }
427                         }
428
429                         break;
430                 }
431
432                 case 'j':
433                         arg_link_journal = LINK_GUEST;
434                         arg_link_journal_try = true;
435                         break;
436
437                 case ARG_LINK_JOURNAL:
438                         if (streq(optarg, "auto"))
439                                 arg_link_journal = LINK_AUTO;
440                         else if (streq(optarg, "no"))
441                                 arg_link_journal = LINK_NO;
442                         else if (streq(optarg, "guest"))
443                                 arg_link_journal = LINK_GUEST;
444                         else if (streq(optarg, "host"))
445                                 arg_link_journal = LINK_HOST;
446                         else if (streq(optarg, "try-guest")) {
447                                 arg_link_journal = LINK_GUEST;
448                                 arg_link_journal_try = true;
449                         } else if (streq(optarg, "try-host")) {
450                                 arg_link_journal = LINK_HOST;
451                                 arg_link_journal_try = true;
452                         } else {
453                                 log_error("Failed to parse link journal mode %s", optarg);
454                                 return -EINVAL;
455                         }
456
457                         break;
458
459                 case ARG_BIND:
460                 case ARG_BIND_RO: {
461                         _cleanup_free_ char *a = NULL, *b = NULL;
462                         char *e;
463                         char ***x;
464
465                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
466
467                         e = strchr(optarg, ':');
468                         if (e) {
469                                 a = strndup(optarg, e - optarg);
470                                 b = strdup(e + 1);
471                         } else {
472                                 a = strdup(optarg);
473                                 b = strdup(optarg);
474                         }
475
476                         if (!a || !b)
477                                 return log_oom();
478
479                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
480                                 log_error("Invalid bind mount specification: %s", optarg);
481                                 return -EINVAL;
482                         }
483
484                         r = strv_extend(x, a);
485                         if (r < 0)
486                                 return log_oom();
487
488                         r = strv_extend(x, b);
489                         if (r < 0)
490                                 return log_oom();
491
492                         break;
493                 }
494
495                 case ARG_TMPFS: {
496                         _cleanup_free_ char *a = NULL, *b = NULL;
497                         char *e;
498
499                         e = strchr(optarg, ':');
500                         if (e) {
501                                 a = strndup(optarg, e - optarg);
502                                 b = strdup(e + 1);
503                         } else {
504                                 a = strdup(optarg);
505                                 b = strdup("mode=0755");
506                         }
507
508                         if (!a || !b)
509                                 return log_oom();
510
511                         if (!path_is_absolute(a)) {
512                                 log_error("Invalid tmpfs specification: %s", optarg);
513                                 return -EINVAL;
514                         }
515
516                         r = strv_push(&arg_tmpfs, a);
517                         if (r < 0)
518                                 return log_oom();
519
520                         a = NULL;
521
522                         r = strv_push(&arg_tmpfs, b);
523                         if (r < 0)
524                                 return log_oom();
525
526                         b = NULL;
527
528                         break;
529                 }
530
531                 case ARG_SETENV: {
532                         char **n;
533
534                         if (!env_assignment_is_valid(optarg)) {
535                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
536                                 return -EINVAL;
537                         }
538
539                         n = strv_env_set(arg_setenv, optarg);
540                         if (!n)
541                                 return log_oom();
542
543                         strv_free(arg_setenv);
544                         arg_setenv = n;
545                         break;
546                 }
547
548                 case 'q':
549                         arg_quiet = true;
550                         break;
551
552                 case ARG_SHARE_SYSTEM:
553                         arg_share_system = true;
554                         break;
555
556                 case ARG_REGISTER:
557                         r = parse_boolean(optarg);
558                         if (r < 0) {
559                                 log_error("Failed to parse --register= argument: %s", optarg);
560                                 return r;
561                         }
562
563                         arg_register = r;
564                         break;
565
566                 case ARG_KEEP_UNIT:
567                         arg_keep_unit = true;
568                         break;
569
570                 case ARG_PERSONALITY:
571
572                         arg_personality = personality_from_string(optarg);
573                         if (arg_personality == 0xffffffffLU) {
574                                 log_error("Unknown or unsupported personality '%s'.", optarg);
575                                 return -EINVAL;
576                         }
577
578                         break;
579
580                 case ARG_VOLATILE:
581
582                         if (!optarg)
583                                 arg_volatile = VOLATILE_YES;
584                         else {
585                                 r = parse_boolean(optarg);
586                                 if (r < 0) {
587                                         if (streq(optarg, "state"))
588                                                 arg_volatile = VOLATILE_STATE;
589                                         else {
590                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
591                                                 return r;
592                                         }
593                                 } else
594                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
595                         }
596
597                         break;
598
599                 case '?':
600                         return -EINVAL;
601
602                 default:
603                         assert_not_reached("Unhandled option");
604                 }
605
606         if (arg_share_system)
607                 arg_register = false;
608
609         if (arg_boot && arg_share_system) {
610                 log_error("--boot and --share-system may not be combined.");
611                 return -EINVAL;
612         }
613
614         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
615                 log_error("--keep-unit may not be used when invoked from a user session.");
616                 return -EINVAL;
617         }
618
619         if (arg_directory && arg_image) {
620                 log_error("--directory= and --image= may not be combined.");
621                 return -EINVAL;
622         }
623
624         if (arg_volatile != VOLATILE_NO && arg_read_only) {
625                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
626                 return -EINVAL;
627         }
628
629         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
630
631         return 1;
632 }
633
634 static int mount_all(const char *dest) {
635
636         typedef struct MountPoint {
637                 const char *what;
638                 const char *where;
639                 const char *type;
640                 const char *options;
641                 unsigned long flags;
642                 bool fatal;
643         } MountPoint;
644
645         static const MountPoint mount_table[] = {
646                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
647                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
648                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
649                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
650                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
651                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
652                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
653                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
654 #ifdef HAVE_SELINUX
655                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
656                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
657 #endif
658         };
659
660         unsigned k;
661         int r = 0;
662
663         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
664                 _cleanup_free_ char *where = NULL;
665 #ifdef HAVE_SELINUX
666                 _cleanup_free_ char *options = NULL;
667 #endif
668                 const char *o;
669                 int t;
670
671                 where = strjoin(dest, "/", mount_table[k].where, NULL);
672                 if (!where)
673                         return log_oom();
674
675                 t = path_is_mount_point(where, true);
676                 if (t < 0) {
677                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
678
679                         if (r == 0)
680                                 r = t;
681
682                         continue;
683                 }
684
685                 /* Skip this entry if it is not a remount. */
686                 if (mount_table[k].what && t > 0)
687                         continue;
688
689                 t = mkdir_p(where, 0755);
690                 if (t < 0) {
691                         if (mount_table[k].fatal) {
692                                log_error_errno(t, "Failed to create directory %s: %m", where);
693
694                                 if (r == 0)
695                                         r = t;
696                         } else
697                                log_warning_errno(t, "Failed to create directory %s: %m", where);
698
699                         continue;
700                 }
701
702 #ifdef HAVE_SELINUX
703                 if (arg_selinux_apifs_context &&
704                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
705                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
706                         if (!options)
707                                 return log_oom();
708
709                         o = options;
710                 } else
711 #endif
712                         o = mount_table[k].options;
713
714
715                 if (mount(mount_table[k].what,
716                           where,
717                           mount_table[k].type,
718                           mount_table[k].flags,
719                           o) < 0) {
720
721                         if (mount_table[k].fatal) {
722                                 log_error_errno(errno, "mount(%s) failed: %m", where);
723
724                                 if (r == 0)
725                                         r = -errno;
726                         } else
727                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
728                 }
729         }
730
731         return r;
732 }
733
734 static int mount_binds(const char *dest, char **l, bool ro) {
735         char **x, **y;
736
737         STRV_FOREACH_PAIR(x, y, l) {
738                 _cleanup_free_ char *where = NULL;
739                 struct stat source_st, dest_st;
740                 int r;
741
742                 if (stat(*x, &source_st) < 0)
743                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
744
745                 where = strappend(dest, *y);
746                 if (!where)
747                         return log_oom();
748
749                 r = stat(where, &dest_st);
750                 if (r == 0) {
751                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
752                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
753                                 return -EINVAL;
754                         }
755                 } else if (errno == ENOENT) {
756                         r = mkdir_parents_label(where, 0755);
757                         if (r < 0)
758                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
759                 } else {
760                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
761                         return -errno;
762                 }
763
764                 /* Create the mount point, but be conservative -- refuse to create block
765                  * and char devices. */
766                 if (S_ISDIR(source_st.st_mode)) {
767                         r = mkdir_label(where, 0755);
768                         if (r < 0 && errno != EEXIST)
769                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
770                 } else if (S_ISFIFO(source_st.st_mode)) {
771                         r = mkfifo(where, 0644);
772                         if (r < 0 && errno != EEXIST)
773                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
774                 } else if (S_ISSOCK(source_st.st_mode)) {
775                         r = mknod(where, 0644 | S_IFSOCK, 0);
776                         if (r < 0 && errno != EEXIST)
777                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
778                 } else if (S_ISREG(source_st.st_mode)) {
779                         r = touch(where);
780                         if (r < 0)
781                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
782                 } else {
783                         log_error("Refusing to create mountpoint for file: %s", *x);
784                         return -ENOTSUP;
785                 }
786
787                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
788                         return log_error_errno(errno, "mount(%s) failed: %m", where);
789
790                 if (ro) {
791                         r = bind_remount_recursive(where, true);
792                         if (r < 0)
793                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
794                 }
795         }
796
797         return 0;
798 }
799
800 static int mount_tmpfs(const char *dest) {
801         char **i, **o;
802
803         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
804                 _cleanup_free_ char *where = NULL;
805                 int r;
806
807                 where = strappend(dest, *i);
808                 if (!where)
809                         return log_oom();
810
811                 r = mkdir_label(where, 0755);
812                 if (r < 0 && r != -EEXIST)
813                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
814
815                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
816                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
817         }
818
819         return 0;
820 }
821
822 static int setup_timezone(const char *dest) {
823         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
824         char *z, *y;
825         int r;
826
827         assert(dest);
828
829         /* Fix the timezone, if possible */
830         r = readlink_malloc("/etc/localtime", &p);
831         if (r < 0) {
832                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
833                 return 0;
834         }
835
836         z = path_startswith(p, "../usr/share/zoneinfo/");
837         if (!z)
838                 z = path_startswith(p, "/usr/share/zoneinfo/");
839         if (!z) {
840                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
841                 return 0;
842         }
843
844         where = strappend(dest, "/etc/localtime");
845         if (!where)
846                 return log_oom();
847
848         r = readlink_malloc(where, &q);
849         if (r >= 0) {
850                 y = path_startswith(q, "../usr/share/zoneinfo/");
851                 if (!y)
852                         y = path_startswith(q, "/usr/share/zoneinfo/");
853
854                 /* Already pointing to the right place? Then do nothing .. */
855                 if (y && streq(y, z))
856                         return 0;
857         }
858
859         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
860         if (!check)
861                 return log_oom();
862
863         if (access(check, F_OK) < 0) {
864                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
865                 return 0;
866         }
867
868         what = strappend("../usr/share/zoneinfo/", z);
869         if (!what)
870                 return log_oom();
871
872         r = mkdir_parents(where, 0755);
873         if (r < 0) {
874                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
875
876                 return 0;
877         }
878
879         r = unlink(where);
880         if (r < 0 && errno != ENOENT) {
881                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
882
883                 return 0;
884         }
885
886         if (symlink(what, where) < 0) {
887                 log_error_errno(errno, "Failed to correct timezone of container: %m");
888                 return 0;
889         }
890
891         return 0;
892 }
893
894 static int setup_resolv_conf(const char *dest) {
895         _cleanup_free_ char *where = NULL;
896         int r;
897
898         assert(dest);
899
900         if (arg_private_network)
901                 return 0;
902
903         /* Fix resolv.conf, if possible */
904         where = strappend(dest, "/etc/resolv.conf");
905         if (!where)
906                 return log_oom();
907
908         /* We don't really care for the results of this really. If it
909          * fails, it fails, but meh... */
910         r = mkdir_parents(where, 0755);
911         if (r < 0) {
912                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
913
914                 return 0;
915         }
916
917         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
918         if (r < 0) {
919                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
920
921                 return 0;
922         }
923
924         return 0;
925 }
926
927 static int setup_volatile_state(const char *directory) {
928         const char *p;
929         int r;
930
931         assert(directory);
932
933         if (arg_volatile != VOLATILE_STATE)
934                 return 0;
935
936         /* --volatile=state means we simply overmount /var
937            with a tmpfs, and the rest read-only. */
938
939         r = bind_remount_recursive(directory, true);
940         if (r < 0)
941                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
942
943         p = strappenda(directory, "/var");
944         r = mkdir(p, 0755);
945         if (r < 0 && errno != EEXIST)
946                 return log_error_errno(errno, "Failed to create %s: %m", directory);
947
948         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
949                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
950
951         return 0;
952 }
953
954 static int setup_volatile(const char *directory) {
955         bool tmpfs_mounted = false, bind_mounted = false;
956         char template[] = "/tmp/nspawn-volatile-XXXXXX";
957         const char *f, *t;
958         int r;
959
960         assert(directory);
961
962         if (arg_volatile != VOLATILE_YES)
963                 return 0;
964
965         /* --volatile=yes means we mount a tmpfs to the root dir, and
966            the original /usr to use inside it, and that read-only. */
967
968         if (!mkdtemp(template))
969                 return log_error_errno(errno, "Failed to create temporary directory: %m");
970
971         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
972                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
973                 r = -errno;
974                 goto fail;
975         }
976
977         tmpfs_mounted = true;
978
979         f = strappenda(directory, "/usr");
980         t = strappenda(template, "/usr");
981
982         r = mkdir(t, 0755);
983         if (r < 0 && errno != EEXIST) {
984                 log_error_errno(errno, "Failed to create %s: %m", t);
985                 r = -errno;
986                 goto fail;
987         }
988
989         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
990                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
991                 r = -errno;
992                 goto fail;
993         }
994
995         bind_mounted = true;
996
997         r = bind_remount_recursive(t, true);
998         if (r < 0) {
999                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1000                 goto fail;
1001         }
1002
1003         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1004                 log_error_errno(errno, "Failed to move root mount: %m");
1005                 r = -errno;
1006                 goto fail;
1007         }
1008
1009         rmdir(template);
1010
1011         return 0;
1012
1013 fail:
1014         if (bind_mounted)
1015                 umount(t);
1016         if (tmpfs_mounted)
1017                 umount(template);
1018         rmdir(template);
1019         return r;
1020 }
1021
1022 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1023
1024         snprintf(s, 37,
1025                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1026                  SD_ID128_FORMAT_VAL(id));
1027
1028         return s;
1029 }
1030
1031 static int setup_boot_id(const char *dest) {
1032         _cleanup_free_ char *from = NULL, *to = NULL;
1033         sd_id128_t rnd = {};
1034         char as_uuid[37];
1035         int r;
1036
1037         assert(dest);
1038
1039         if (arg_share_system)
1040                 return 0;
1041
1042         /* Generate a new randomized boot ID, so that each boot-up of
1043          * the container gets a new one */
1044
1045         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1046         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1047         if (!from || !to)
1048                 return log_oom();
1049
1050         r = sd_id128_randomize(&rnd);
1051         if (r < 0)
1052                 return log_error_errno(r, "Failed to generate random boot id: %m");
1053
1054         id128_format_as_uuid(rnd, as_uuid);
1055
1056         r = write_string_file(from, as_uuid);
1057         if (r < 0)
1058                 return log_error_errno(r, "Failed to write boot id: %m");
1059
1060         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1061                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1062                 r = -errno;
1063         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1064                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1065
1066         unlink(from);
1067         return r;
1068 }
1069
1070 static int copy_devnodes(const char *dest) {
1071
1072         static const char devnodes[] =
1073                 "null\0"
1074                 "zero\0"
1075                 "full\0"
1076                 "random\0"
1077                 "urandom\0"
1078                 "tty\0"
1079                 "net/tun\0";
1080
1081         const char *d;
1082         int r = 0;
1083         _cleanup_umask_ mode_t u;
1084
1085         assert(dest);
1086
1087         u = umask(0000);
1088
1089         NULSTR_FOREACH(d, devnodes) {
1090                 _cleanup_free_ char *from = NULL, *to = NULL;
1091                 struct stat st;
1092
1093                 from = strappend("/dev/", d);
1094                 to = strjoin(dest, "/dev/", d, NULL);
1095                 if (!from || !to)
1096                         return log_oom();
1097
1098                 if (stat(from, &st) < 0) {
1099
1100                         if (errno != ENOENT)
1101                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1102
1103                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1104
1105                         log_error("%s is not a char or block device, cannot copy", from);
1106                         return -EIO;
1107
1108                 } else {
1109                         r = mkdir_parents(to, 0775);
1110                         if (r < 0) {
1111                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1112                                 return -r;
1113                         }
1114
1115                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1116                                 return log_error_errno(errno, "mknod(%s) failed: %m", dest);
1117                 }
1118         }
1119
1120         return r;
1121 }
1122
1123 static int setup_ptmx(const char *dest) {
1124         _cleanup_free_ char *p = NULL;
1125
1126         p = strappend(dest, "/dev/ptmx");
1127         if (!p)
1128                 return log_oom();
1129
1130         if (symlink("pts/ptmx", p) < 0)
1131                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1132
1133         return 0;
1134 }
1135
1136 static int setup_dev_console(const char *dest, const char *console) {
1137         _cleanup_umask_ mode_t u;
1138         const char *to;
1139         struct stat st;
1140         int r;
1141
1142         assert(dest);
1143         assert(console);
1144
1145         u = umask(0000);
1146
1147         if (stat("/dev/null", &st) < 0)
1148                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1149
1150         r = chmod_and_chown(console, 0600, 0, 0);
1151         if (r < 0)
1152                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1153
1154         /* We need to bind mount the right tty to /dev/console since
1155          * ptys can only exist on pts file systems. To have something
1156          * to bind mount things on we create a device node first, and
1157          * use /dev/null for that since we the cgroups device policy
1158          * allows us to create that freely, while we cannot create
1159          * /dev/console. (Note that the major minor doesn't actually
1160          * matter here, since we mount it over anyway). */
1161
1162         to = strappenda(dest, "/dev/console");
1163         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1164                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1165
1166         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1167                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1168
1169         return 0;
1170 }
1171
1172 static int setup_kmsg(const char *dest, int kmsg_socket) {
1173         _cleanup_free_ char *from = NULL, *to = NULL;
1174         int r, fd, k;
1175         _cleanup_umask_ mode_t u;
1176         union {
1177                 struct cmsghdr cmsghdr;
1178                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1179         } control = {};
1180         struct msghdr mh = {
1181                 .msg_control = &control,
1182                 .msg_controllen = sizeof(control),
1183         };
1184         struct cmsghdr *cmsg;
1185
1186         assert(dest);
1187         assert(kmsg_socket >= 0);
1188
1189         u = umask(0000);
1190
1191         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1192          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1193          * on the reading side behave very similar to /proc/kmsg,
1194          * their writing side behaves differently from /dev/kmsg in
1195          * that writing blocks when nothing is reading. In order to
1196          * avoid any problems with containers deadlocking due to this
1197          * we simply make /dev/kmsg unavailable to the container. */
1198         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1199             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1200                 return log_oom();
1201
1202         if (mkfifo(from, 0600) < 0)
1203                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1204
1205         r = chmod_and_chown(from, 0600, 0, 0);
1206         if (r < 0)
1207                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1208
1209         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1210                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1211
1212         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1213         if (fd < 0)
1214                 return log_error_errno(errno, "Failed to open fifo: %m");
1215
1216         cmsg = CMSG_FIRSTHDR(&mh);
1217         cmsg->cmsg_level = SOL_SOCKET;
1218         cmsg->cmsg_type = SCM_RIGHTS;
1219         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1220         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1221
1222         mh.msg_controllen = cmsg->cmsg_len;
1223
1224         /* Store away the fd in the socket, so that it stays open as
1225          * long as we run the child */
1226         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1227         safe_close(fd);
1228
1229         if (k < 0)
1230                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1231
1232         /* And now make the FIFO unavailable as /dev/kmsg... */
1233         unlink(from);
1234         return 0;
1235 }
1236
1237 static int setup_hostname(void) {
1238
1239         if (arg_share_system)
1240                 return 0;
1241
1242         if (sethostname_idempotent(arg_machine) < 0)
1243                 return -errno;
1244
1245         return 0;
1246 }
1247
1248 static int setup_journal(const char *directory) {
1249         sd_id128_t machine_id, this_id;
1250         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1251         char *id;
1252         int r;
1253
1254         p = strappend(directory, "/etc/machine-id");
1255         if (!p)
1256                 return log_oom();
1257
1258         r = read_one_line_file(p, &b);
1259         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1260                 return 0;
1261         else if (r < 0)
1262                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1263
1264         id = strstrip(b);
1265         if (isempty(id) && arg_link_journal == LINK_AUTO)
1266                 return 0;
1267
1268         /* Verify validity */
1269         r = sd_id128_from_string(id, &machine_id);
1270         if (r < 0)
1271                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1272
1273         r = sd_id128_get_machine(&this_id);
1274         if (r < 0)
1275                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1276
1277         if (sd_id128_equal(machine_id, this_id)) {
1278                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1279                          "Host and machine ids are equal (%s): refusing to link journals", id);
1280                 if (arg_link_journal == LINK_AUTO)
1281                         return 0;
1282                 return
1283                         -EEXIST;
1284         }
1285
1286         if (arg_link_journal == LINK_NO)
1287                 return 0;
1288
1289         free(p);
1290         p = strappend("/var/log/journal/", id);
1291         q = strjoin(directory, "/var/log/journal/", id, NULL);
1292         if (!p || !q)
1293                 return log_oom();
1294
1295         if (path_is_mount_point(p, false) > 0) {
1296                 if (arg_link_journal != LINK_AUTO) {
1297                         log_error("%s: already a mount point, refusing to use for journal", p);
1298                         return -EEXIST;
1299                 }
1300
1301                 return 0;
1302         }
1303
1304         if (path_is_mount_point(q, false) > 0) {
1305                 if (arg_link_journal != LINK_AUTO) {
1306                         log_error("%s: already a mount point, refusing to use for journal", q);
1307                         return -EEXIST;
1308                 }
1309
1310                 return 0;
1311         }
1312
1313         r = readlink_and_make_absolute(p, &d);
1314         if (r >= 0) {
1315                 if ((arg_link_journal == LINK_GUEST ||
1316                      arg_link_journal == LINK_AUTO) &&
1317                     path_equal(d, q)) {
1318
1319                         r = mkdir_p(q, 0755);
1320                         if (r < 0)
1321                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1322                         return 0;
1323                 }
1324
1325                 if (unlink(p) < 0)
1326                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1327         } else if (r == -EINVAL) {
1328
1329                 if (arg_link_journal == LINK_GUEST &&
1330                     rmdir(p) < 0) {
1331
1332                         if (errno == ENOTDIR) {
1333                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1334                                 return r;
1335                         } else {
1336                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1337                                 return -errno;
1338                         }
1339                 }
1340         } else if (r != -ENOENT) {
1341                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1342                 return r;
1343         }
1344
1345         if (arg_link_journal == LINK_GUEST) {
1346
1347                 if (symlink(q, p) < 0) {
1348                         if (arg_link_journal_try) {
1349                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1350                                 return 0;
1351                         } else {
1352                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1353                                 return -errno;
1354                         }
1355                 }
1356
1357                 r = mkdir_p(q, 0755);
1358                 if (r < 0)
1359                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1360                 return 0;
1361         }
1362
1363         if (arg_link_journal == LINK_HOST) {
1364                 /* don't create parents here -- if the host doesn't have
1365                  * permanent journal set up, don't force it here */
1366                 r = mkdir(p, 0755);
1367                 if (r < 0) {
1368                         if (arg_link_journal_try) {
1369                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1370                                 return 0;
1371                         } else {
1372                                 log_error_errno(errno, "Failed to create %s: %m", p);
1373                                 return r;
1374                         }
1375                 }
1376
1377         } else if (access(p, F_OK) < 0)
1378                 return 0;
1379
1380         if (dir_is_empty(q) == 0)
1381                 log_warning("%s is not empty, proceeding anyway.", q);
1382
1383         r = mkdir_p(q, 0755);
1384         if (r < 0) {
1385                 log_error_errno(errno, "Failed to create %s: %m", q);
1386                 return r;
1387         }
1388
1389         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1390                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1391
1392         return 0;
1393 }
1394
1395 static int drop_capabilities(void) {
1396         return capability_bounding_set_drop(~arg_retain, false);
1397 }
1398
1399 static int register_machine(pid_t pid, int local_ifindex) {
1400         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1401         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1402         int r;
1403
1404         if (!arg_register)
1405                 return 0;
1406
1407         r = sd_bus_default_system(&bus);
1408         if (r < 0)
1409                 return log_error_errno(r, "Failed to open system bus: %m");
1410
1411         if (arg_keep_unit) {
1412                 r = sd_bus_call_method(
1413                                 bus,
1414                                 "org.freedesktop.machine1",
1415                                 "/org/freedesktop/machine1",
1416                                 "org.freedesktop.machine1.Manager",
1417                                 "RegisterMachineWithNetwork",
1418                                 &error,
1419                                 NULL,
1420                                 "sayssusai",
1421                                 arg_machine,
1422                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1423                                 "nspawn",
1424                                 "container",
1425                                 (uint32_t) pid,
1426                                 strempty(arg_directory),
1427                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1428         } else {
1429                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1430
1431                 r = sd_bus_message_new_method_call(
1432                                 bus,
1433                                 &m,
1434                                 "org.freedesktop.machine1",
1435                                 "/org/freedesktop/machine1",
1436                                 "org.freedesktop.machine1.Manager",
1437                                 "CreateMachineWithNetwork");
1438                 if (r < 0)
1439                         return log_error_errno(r, "Failed to create message: %m");
1440
1441                 r = sd_bus_message_append(
1442                                 m,
1443                                 "sayssusai",
1444                                 arg_machine,
1445                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1446                                 "nspawn",
1447                                 "container",
1448                                 (uint32_t) pid,
1449                                 strempty(arg_directory),
1450                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1451                 if (r < 0)
1452                         return log_error_errno(r, "Failed to append message arguments: %m");
1453
1454                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1455                 if (r < 0)
1456                         return log_error_errno(r, "Failed to open container: %m");
1457
1458                 if (!isempty(arg_slice)) {
1459                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1460                         if (r < 0)
1461                                 return log_error_errno(r, "Failed to append slice: %m");
1462                 }
1463
1464                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1465                 if (r < 0)
1466                         return log_error_errno(r, "Failed to add device policy: %m");
1467
1468                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1469                                           /* Allow the container to
1470                                            * access and create the API
1471                                            * device nodes, so that
1472                                            * PrivateDevices= in the
1473                                            * container can work
1474                                            * fine */
1475                                           "/dev/null", "rwm",
1476                                           "/dev/zero", "rwm",
1477                                           "/dev/full", "rwm",
1478                                           "/dev/random", "rwm",
1479                                           "/dev/urandom", "rwm",
1480                                           "/dev/tty", "rwm",
1481                                           "/dev/net/tun", "rwm",
1482                                           /* Allow the container
1483                                            * access to ptys. However,
1484                                            * do not permit the
1485                                            * container to ever create
1486                                            * these device nodes. */
1487                                           "/dev/pts/ptmx", "rw",
1488                                           "char-pts", "rw");
1489                 if (r < 0)
1490                         return log_error_errno(r, "Failed to add device whitelist: %m");
1491
1492                 r = sd_bus_message_close_container(m);
1493                 if (r < 0)
1494                         return log_error_errno(r, "Failed to close container: %m");
1495
1496                 r = sd_bus_call(bus, m, 0, &error, NULL);
1497         }
1498
1499         if (r < 0) {
1500                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1501                 return r;
1502         }
1503
1504         return 0;
1505 }
1506
1507 static int terminate_machine(pid_t pid) {
1508         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1509         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1510         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1511         const char *path;
1512         int r;
1513
1514         if (!arg_register)
1515                 return 0;
1516
1517         r = sd_bus_default_system(&bus);
1518         if (r < 0)
1519                 return log_error_errno(r, "Failed to open system bus: %m");
1520
1521         r = sd_bus_call_method(
1522                         bus,
1523                         "org.freedesktop.machine1",
1524                         "/org/freedesktop/machine1",
1525                         "org.freedesktop.machine1.Manager",
1526                         "GetMachineByPID",
1527                         &error,
1528                         &reply,
1529                         "u",
1530                         (uint32_t) pid);
1531         if (r < 0) {
1532                 /* Note that the machine might already have been
1533                  * cleaned up automatically, hence don't consider it a
1534                  * failure if we cannot get the machine object. */
1535                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1536                 return 0;
1537         }
1538
1539         r = sd_bus_message_read(reply, "o", &path);
1540         if (r < 0)
1541                 return bus_log_parse_error(r);
1542
1543         r = sd_bus_call_method(
1544                         bus,
1545                         "org.freedesktop.machine1",
1546                         path,
1547                         "org.freedesktop.machine1.Machine",
1548                         "Terminate",
1549                         &error,
1550                         NULL,
1551                         NULL);
1552         if (r < 0) {
1553                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1554                 return 0;
1555         }
1556
1557         return 0;
1558 }
1559
1560 static int reset_audit_loginuid(void) {
1561         _cleanup_free_ char *p = NULL;
1562         int r;
1563
1564         if (arg_share_system)
1565                 return 0;
1566
1567         r = read_one_line_file("/proc/self/loginuid", &p);
1568         if (r == -ENOENT)
1569                 return 0;
1570         if (r < 0)
1571                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1572
1573         /* Already reset? */
1574         if (streq(p, "4294967295"))
1575                 return 0;
1576
1577         r = write_string_file("/proc/self/loginuid", "4294967295");
1578         if (r < 0) {
1579                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1580                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1581                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1582                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1583                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1584
1585                 sleep(5);
1586         }
1587
1588         return 0;
1589 }
1590
1591 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1592 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1593 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
1594
1595 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
1596         uint8_t result[8];
1597         size_t l, sz;
1598         uint8_t *v, *i;
1599         int r;
1600
1601         l = strlen(arg_machine);
1602         sz = sizeof(sd_id128_t) + l;
1603         if (idx > 0)
1604                 sz += sizeof(idx);
1605
1606         v = alloca(sz);
1607
1608         /* fetch some persistent data unique to the host */
1609         r = sd_id128_get_machine((sd_id128_t*) v);
1610         if (r < 0)
1611                 return r;
1612
1613         /* combine with some data unique (on this host) to this
1614          * container instance */
1615         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
1616         if (idx > 0) {
1617                 idx = htole64(idx);
1618                 memcpy(i, &idx, sizeof(idx));
1619         }
1620
1621         /* Let's hash the host machine ID plus the container name. We
1622          * use a fixed, but originally randomly created hash key here. */
1623         siphash24(result, v, sz, hash_key.bytes);
1624
1625         assert_cc(ETH_ALEN <= sizeof(result));
1626         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1627
1628         /* see eth_random_addr in the kernel */
1629         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1630         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1631
1632         return 0;
1633 }
1634
1635 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1636         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1637         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1638         struct ether_addr mac_host, mac_container;
1639         int r, i;
1640
1641         if (!arg_private_network)
1642                 return 0;
1643
1644         if (!arg_network_veth)
1645                 return 0;
1646
1647         /* Use two different interface name prefixes depending whether
1648          * we are in bridge mode or not. */
1649         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1650                  arg_network_bridge ? "vb" : "ve", arg_machine);
1651
1652         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1653         if (r < 0)
1654                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
1655
1656         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1657         if (r < 0)
1658                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
1659
1660         r = sd_rtnl_open(&rtnl, 0);
1661         if (r < 0)
1662                 return log_error_errno(r, "Failed to connect to netlink: %m");
1663
1664         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1665         if (r < 0)
1666                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1667
1668         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1669         if (r < 0)
1670                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1671
1672         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1673         if (r < 0)
1674                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1675
1676         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1677         if (r < 0)
1678                 return log_error_errno(r, "Failed to open netlink container: %m");
1679
1680         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1681         if (r < 0)
1682                 return log_error_errno(r, "Failed to open netlink container: %m");
1683
1684         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1685         if (r < 0)
1686                 return log_error_errno(r, "Failed to open netlink container: %m");
1687
1688         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1689         if (r < 0)
1690                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1691
1692         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1693         if (r < 0)
1694                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1695
1696         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1697         if (r < 0)
1698                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1699
1700         r = sd_rtnl_message_close_container(m);
1701         if (r < 0)
1702                 return log_error_errno(r, "Failed to close netlink container: %m");
1703
1704         r = sd_rtnl_message_close_container(m);
1705         if (r < 0)
1706                 return log_error_errno(r, "Failed to close netlink container: %m");
1707
1708         r = sd_rtnl_message_close_container(m);
1709         if (r < 0)
1710                 return log_error_errno(r, "Failed to close netlink container: %m");
1711
1712         r = sd_rtnl_call(rtnl, m, 0, NULL);
1713         if (r < 0)
1714                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
1715
1716         i = (int) if_nametoindex(iface_name);
1717         if (i <= 0)
1718                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
1719
1720         *ifi = i;
1721
1722         return 0;
1723 }
1724
1725 static int setup_bridge(const char veth_name[], int *ifi) {
1726         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1727         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1728         int r, bridge;
1729
1730         if (!arg_private_network)
1731                 return 0;
1732
1733         if (!arg_network_veth)
1734                 return 0;
1735
1736         if (!arg_network_bridge)
1737                 return 0;
1738
1739         bridge = (int) if_nametoindex(arg_network_bridge);
1740         if (bridge <= 0)
1741                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
1742
1743         *ifi = bridge;
1744
1745         r = sd_rtnl_open(&rtnl, 0);
1746         if (r < 0)
1747                 return log_error_errno(r, "Failed to connect to netlink: %m");
1748
1749         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1750         if (r < 0)
1751                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1752
1753         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1754         if (r < 0)
1755                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
1756
1757         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1758         if (r < 0)
1759                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
1760
1761         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1762         if (r < 0)
1763                 return log_error_errno(r, "Failed to add netlink master field: %m");
1764
1765         r = sd_rtnl_call(rtnl, m, 0, NULL);
1766         if (r < 0)
1767                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
1768
1769         return 0;
1770 }
1771
1772 static int parse_interface(struct udev *udev, const char *name) {
1773         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1774         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1775         int ifi;
1776
1777         ifi = (int) if_nametoindex(name);
1778         if (ifi <= 0)
1779                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
1780
1781         sprintf(ifi_str, "n%i", ifi);
1782         d = udev_device_new_from_device_id(udev, ifi_str);
1783         if (!d)
1784                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
1785
1786         if (udev_device_get_is_initialized(d) <= 0) {
1787                 log_error("Network interface %s is not initialized yet.", name);
1788                 return -EBUSY;
1789         }
1790
1791         return ifi;
1792 }
1793
1794 static int move_network_interfaces(pid_t pid) {
1795         _cleanup_udev_unref_ struct udev *udev = NULL;
1796         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1797         char **i;
1798         int r;
1799
1800         if (!arg_private_network)
1801                 return 0;
1802
1803         if (strv_isempty(arg_network_interfaces))
1804                 return 0;
1805
1806         r = sd_rtnl_open(&rtnl, 0);
1807         if (r < 0)
1808                 return log_error_errno(r, "Failed to connect to netlink: %m");
1809
1810         udev = udev_new();
1811         if (!udev) {
1812                 log_error("Failed to connect to udev.");
1813                 return -ENOMEM;
1814         }
1815
1816         STRV_FOREACH(i, arg_network_interfaces) {
1817                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1818                 int ifi;
1819
1820                 ifi = parse_interface(udev, *i);
1821                 if (ifi < 0)
1822                         return ifi;
1823
1824                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1825                 if (r < 0)
1826                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1827
1828                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1829                 if (r < 0)
1830                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
1831
1832                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1833                 if (r < 0)
1834                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
1835         }
1836
1837         return 0;
1838 }
1839
1840 static int setup_macvlan(pid_t pid) {
1841         _cleanup_udev_unref_ struct udev *udev = NULL;
1842         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1843         unsigned idx = 0;
1844         char **i;
1845         int r;
1846
1847         if (!arg_private_network)
1848                 return 0;
1849
1850         if (strv_isempty(arg_network_macvlan))
1851                 return 0;
1852
1853         r = sd_rtnl_open(&rtnl, 0);
1854         if (r < 0)
1855                 return log_error_errno(r, "Failed to connect to netlink: %m");
1856
1857         udev = udev_new();
1858         if (!udev) {
1859                 log_error("Failed to connect to udev.");
1860                 return -ENOMEM;
1861         }
1862
1863         STRV_FOREACH(i, arg_network_macvlan) {
1864                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1865                 _cleanup_free_ char *n = NULL;
1866                 struct ether_addr mac;
1867                 int ifi;
1868
1869                 ifi = parse_interface(udev, *i);
1870                 if (ifi < 0)
1871                         return ifi;
1872
1873                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
1874                 if (r < 0)
1875                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
1876
1877                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1878                 if (r < 0)
1879                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1880
1881                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1882                 if (r < 0)
1883                         return log_error_errno(r, "Failed to add netlink interface index: %m");
1884
1885                 n = strappend("mv-", *i);
1886                 if (!n)
1887                         return log_oom();
1888
1889                 strshorten(n, IFNAMSIZ-1);
1890
1891                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1892                 if (r < 0)
1893                         return log_error_errno(r, "Failed to add netlink interface name: %m");
1894
1895                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1896                 if (r < 0)
1897                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
1898
1899                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1900                 if (r < 0)
1901                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
1902
1903                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1904                 if (r < 0)
1905                         return log_error_errno(r, "Failed to open netlink container: %m");
1906
1907                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1908                 if (r < 0)
1909                         return log_error_errno(r, "Failed to open netlink container: %m");
1910
1911                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1912                 if (r < 0)
1913                         return log_error_errno(r, "Failed to append macvlan mode: %m");
1914
1915                 r = sd_rtnl_message_close_container(m);
1916                 if (r < 0)
1917                         return log_error_errno(r, "Failed to close netlink container: %m");
1918
1919                 r = sd_rtnl_message_close_container(m);
1920                 if (r < 0)
1921                         return log_error_errno(r, "Failed to close netlink container: %m");
1922
1923                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1924                 if (r < 0)
1925                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
1926         }
1927
1928         return 0;
1929 }
1930
/* Builds and loads a seccomp filter that allows everything by default, makes
 * the blacklisted system calls below fail with EPERM and makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns the result of the last libseccomp call (negative on failure, after
 * logging). Without HAVE_SECCOMP this is a no-op that returns 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto out;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k], 0);

                /* -EFAULT indicates an unknown syscall, which is skipped. */
                if (r >= 0 || r == -EFAULT)
                        continue;

                log_error_errno(r, "Failed to block syscall: %m");
                goto out;
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(ctx,
                             SCMP_ACT_ERRNO(EAFNOSUPPORT),
                             SCMP_SYS(socket),
                             2,
                             SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                             SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto out;
        }

        /* Loading the filter must not switch on NO_NEW_PRIVS. */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto out;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

out:
        /* The filter context is released on every path. */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2010
2011 static int setup_image(char **device_path, int *loop_nr) {
2012         struct loop_info64 info = {
2013                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2014         };
2015         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2016         _cleanup_free_ char* loopdev = NULL;
2017         struct stat st;
2018         int r, nr;
2019
2020         assert(device_path);
2021         assert(loop_nr);
2022
2023         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2024         if (fd < 0)
2025                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2026
2027         if (fstat(fd, &st) < 0)
2028                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2029
2030         if (S_ISBLK(st.st_mode)) {
2031                 char *p;
2032
2033                 p = strdup(arg_image);
2034                 if (!p)
2035                         return log_oom();
2036
2037                 *device_path = p;
2038
2039                 *loop_nr = -1;
2040
2041                 r = fd;
2042                 fd = -1;
2043
2044                 return r;
2045         }
2046
2047         if (!S_ISREG(st.st_mode)) {
2048                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2049                 return -EINVAL;
2050         }
2051
2052         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2053         if (control < 0)
2054                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2055
2056         nr = ioctl(control, LOOP_CTL_GET_FREE);
2057         if (nr < 0)
2058                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2059
2060         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2061                 return log_oom();
2062
2063         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2064         if (loop < 0)
2065                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2066
2067         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2068                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2069
2070         if (arg_read_only)
2071                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2072
2073         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2074                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2075
2076         *device_path = loopdev;
2077         loopdev = NULL;
2078
2079         *loop_nr = nr;
2080
2081         r = loop;
2082         loop = -1;
2083
2084         return r;
2085 }
2086
#ifdef HAVE_BLKID
/* Records `node` as the device to use for one partition type.
 *
 * If a device was recorded before, it is only replaced when the new
 * partition number `nr` is lower than the recorded one, so that the
 * partition with the lowest number wins. Returns 0 on success (also when the
 * candidate was ignored) and the result of log_oom() if the device node path
 * cannot be duplicated; in that case the recorded state is left unchanged. */
static int dissect_image_pick_partition(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, bool rw) {

        char *copy;

        assert(device);
        assert(device_nr);
        assert(device_rw);
        assert(node);

        if (*device && nr >= *device_nr)
                return 0;

        copy = strdup(node);
        if (!copy)
                return log_oom();

        free(*device);
        *device = copy;
        *device_nr = nr;
        *device_rw = rw;

        return 0;
}
#endif

/* Looks for the root, /home and /srv partitions of the GPT disk image that
 * is open as block device `fd`.
 *
 * The partition table is probed with libblkid and the partition devices are
 * enumerated with udev. Partitions flagged GPT_FLAG_NO_AUTO are ignored. Per
 * partition type (GPT_HOME, GPT_SRV and, where defined, GPT_ROOT_NATIVE and
 * GPT_ROOT_SECONDARY) the one with the lowest partition number is chosen.
 *
 * On success 0 is returned. *root_device receives the device node of the
 * native root partition, or of the secondary root partition if there is no
 * native one; *secondary tells which of the two was picked. *home_device and
 * *srv_device are only written if such a partition exists. Each *_rw flag is
 * written together with its device and is false if the partition carries
 * GPT_FLAG_READ_ONLY. Returned strings are owned by the caller.
 *
 * On failure a negative errno-style value is returned: -EINVAL if no GPT
 * partition table or no root partition is found, -ENOTSUP if compiled
 * without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif

        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(root_device_rw);
        assert(home_device);
        assert(home_device_rw);
        assert(srv_device);
        assert(srv_device_rw);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* libblkid does not reliably set errno; a failure that leaves errno
         * at zero is treated as out-of-memory. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all. */
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate all devices below the whole-disk device; the partition
         * devices are among them. */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                bool rw;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                /* Skip entries without a device number. */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself. */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                /* Match the udev device to its partition table entry. */
                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                rw = !(flags & GPT_FLAG_READ_ONLY);

                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_image_pick_partition(&home, &home_nr, &home_rw, node, nr, rw);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_image_pick_partition(&srv, &srv_nr, &srv_rw, node, nr, rw);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_image_pick_partition(&root, &root_nr, &root_rw, node, nr, rw);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_image_pick_partition(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, rw);
#endif
                else
                        r = 0; /* a partition type we have no use for */

                if (r < 0)
                        return r;
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Hand the strings over to the caller; resetting the local pointers
         * keeps the cleanup handlers from freeing them. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2339
/* Mounts the block device `what` on `where`, or on `where` with `directory`
 * appended if `directory` is non-NULL. The file system type is probed with
 * libblkid.
 *
 * The mount is done with MS_NODEV, and read-only if `rw` is false or
 * arg_read_only is set. LUKS volumes are refused with -ENOTSUP.
 *
 * Returns 0 on success and a negative errno-style value on failure (-EINVAL
 * if the file system type cannot be determined, -ENOTSUP if compiled without
 * blkid support). */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno; a failure that leaves errno
         * at zero is treated as out-of-memory. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2 is an ambivalent result and 1 means nothing was
                 * detected: in both cases the type is simply unknown. A
                 * probing error (-1) is handled by the branch below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2403
/*
 * Mount the partitions found in a disk image below "where".
 *
 * The root device is mounted on "where" itself, the home device on
 * "/home" and the srv device on "/srv" beneath it. Any device that is
 * NULL is skipped. The *_rw flags are handed to mount_device() to
 * select a writable or read-only mount.
 *
 * Returns 0 on success, or the negative error of the first mount that
 * failed.
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Root has to come first, the other two are mounted inside it */
        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
2433
2434 static void loop_remove(int nr, int *image_fd) {
2435         _cleanup_close_ int control = -1;
2436         int r;
2437
2438         if (nr < 0)
2439                 return;
2440
2441         if (image_fd && *image_fd >= 0) {
2442                 r = ioctl(*image_fd, LOOP_CLR_FD);
2443                 if (r < 0)
2444                         log_warning_errno(errno, "Failed to close loop image: %m");
2445                 *image_fd = safe_close(*image_fd);
2446         }
2447
2448         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2449         if (control < 0) {
2450                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2451                 return;
2452         }
2453
2454         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2455         if (r < 0)
2456                 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
2457 }
2458
2459 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2460         int pipe_fds[2];
2461         pid_t pid;
2462
2463         assert(database);
2464         assert(key);
2465         assert(rpid);
2466
2467         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2468                 return log_error_errno(errno, "Failed to allocate pipe: %m");
2469
2470         pid = fork();
2471         if (pid < 0)
2472                 return log_error_errno(errno, "Failed to fork getent child: %m");
2473         else if (pid == 0) {
2474                 int nullfd;
2475                 char *empty_env = NULL;
2476
2477                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2478                         _exit(EXIT_FAILURE);
2479
2480                 if (pipe_fds[0] > 2)
2481                         safe_close(pipe_fds[0]);
2482                 if (pipe_fds[1] > 2)
2483                         safe_close(pipe_fds[1]);
2484
2485                 nullfd = open("/dev/null", O_RDWR);
2486                 if (nullfd < 0)
2487                         _exit(EXIT_FAILURE);
2488
2489                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2490                         _exit(EXIT_FAILURE);
2491
2492                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2493                         _exit(EXIT_FAILURE);
2494
2495                 if (nullfd > 2)
2496                         safe_close(nullfd);
2497
2498                 reset_all_signal_handlers();
2499                 close_all_fds(NULL, 0);
2500
2501                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2502                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2503                 _exit(EXIT_FAILURE);
2504         }
2505
2506         pipe_fds[1] = safe_close(pipe_fds[1]);
2507
2508         *rpid = pid;
2509
2510         return pipe_fds[0];
2511 }
2512
2513 static int change_uid_gid(char **_home) {
2514         char line[LINE_MAX], *x, *u, *g, *h;
2515         const char *word, *state;
2516         _cleanup_free_ uid_t *uids = NULL;
2517         _cleanup_free_ char *home = NULL;
2518         _cleanup_fclose_ FILE *f = NULL;
2519         _cleanup_close_ int fd = -1;
2520         unsigned n_uids = 0;
2521         size_t sz = 0, l;
2522         uid_t uid;
2523         gid_t gid;
2524         pid_t pid;
2525         int r;
2526
2527         assert(_home);
2528
2529         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2530                 /* Reset everything fully to 0, just in case */
2531
2532                 if (setgroups(0, NULL) < 0)
2533                         return log_error_errno(errno, "setgroups() failed: %m");
2534
2535                 if (setresgid(0, 0, 0) < 0)
2536                         return log_error_errno(errno, "setregid() failed: %m");
2537
2538                 if (setresuid(0, 0, 0) < 0)
2539                         return log_error_errno(errno, "setreuid() failed: %m");
2540
2541                 *_home = NULL;
2542                 return 0;
2543         }
2544
2545         /* First, get user credentials */
2546         fd = spawn_getent("passwd", arg_user, &pid);
2547         if (fd < 0)
2548                 return fd;
2549
2550         f = fdopen(fd, "r");
2551         if (!f)
2552                 return log_oom();
2553         fd = -1;
2554
2555         if (!fgets(line, sizeof(line), f)) {
2556
2557                 if (!ferror(f)) {
2558                         log_error("Failed to resolve user %s.", arg_user);
2559                         return -ESRCH;
2560                 }
2561
2562                 log_error_errno(errno, "Failed to read from getent: %m");
2563                 return -errno;
2564         }
2565
2566         truncate_nl(line);
2567
2568         wait_for_terminate_and_warn("getent passwd", pid, true);
2569
2570         x = strchr(line, ':');
2571         if (!x) {
2572                 log_error("/etc/passwd entry has invalid user field.");
2573                 return -EIO;
2574         }
2575
2576         u = strchr(x+1, ':');
2577         if (!u) {
2578                 log_error("/etc/passwd entry has invalid password field.");
2579                 return -EIO;
2580         }
2581
2582         u++;
2583         g = strchr(u, ':');
2584         if (!g) {
2585                 log_error("/etc/passwd entry has invalid UID field.");
2586                 return -EIO;
2587         }
2588
2589         *g = 0;
2590         g++;
2591         x = strchr(g, ':');
2592         if (!x) {
2593                 log_error("/etc/passwd entry has invalid GID field.");
2594                 return -EIO;
2595         }
2596
2597         *x = 0;
2598         h = strchr(x+1, ':');
2599         if (!h) {
2600                 log_error("/etc/passwd entry has invalid GECOS field.");
2601                 return -EIO;
2602         }
2603
2604         h++;
2605         x = strchr(h, ':');
2606         if (!x) {
2607                 log_error("/etc/passwd entry has invalid home directory field.");
2608                 return -EIO;
2609         }
2610
2611         *x = 0;
2612
2613         r = parse_uid(u, &uid);
2614         if (r < 0) {
2615                 log_error("Failed to parse UID of user.");
2616                 return -EIO;
2617         }
2618
2619         r = parse_gid(g, &gid);
2620         if (r < 0) {
2621                 log_error("Failed to parse GID of user.");
2622                 return -EIO;
2623         }
2624
2625         home = strdup(h);
2626         if (!home)
2627                 return log_oom();
2628
2629         /* Second, get group memberships */
2630         fd = spawn_getent("initgroups", arg_user, &pid);
2631         if (fd < 0)
2632                 return fd;
2633
2634         fclose(f);
2635         f = fdopen(fd, "r");
2636         if (!f)
2637                 return log_oom();
2638         fd = -1;
2639
2640         if (!fgets(line, sizeof(line), f)) {
2641                 if (!ferror(f)) {
2642                         log_error("Failed to resolve user %s.", arg_user);
2643                         return -ESRCH;
2644                 }
2645
2646                 log_error_errno(errno, "Failed to read from getent: %m");
2647                 return -errno;
2648         }
2649
2650         truncate_nl(line);
2651
2652         wait_for_terminate_and_warn("getent initgroups", pid, true);
2653
2654         /* Skip over the username and subsequent separator whitespace */
2655         x = line;
2656         x += strcspn(x, WHITESPACE);
2657         x += strspn(x, WHITESPACE);
2658
2659         FOREACH_WORD(word, l, x, state) {
2660                 char c[l+1];
2661
2662                 memcpy(c, word, l);
2663                 c[l] = 0;
2664
2665                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2666                         return log_oom();
2667
2668                 r = parse_uid(c, &uids[n_uids++]);
2669                 if (r < 0) {
2670                         log_error("Failed to parse group data from getent.");
2671                         return -EIO;
2672                 }
2673         }
2674
2675         r = mkdir_parents(home, 0775);
2676         if (r < 0)
2677                 return log_error_errno(r, "Failed to make home root directory: %m");
2678
2679         r = mkdir_safe(home, 0755, uid, gid);
2680         if (r < 0 && r != -EEXIST)
2681                 return log_error_errno(r, "Failed to make home directory: %m");
2682
2683         fchown(STDIN_FILENO, uid, gid);
2684         fchown(STDOUT_FILENO, uid, gid);
2685         fchown(STDERR_FILENO, uid, gid);
2686
2687         if (setgroups(n_uids, uids) < 0)
2688                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
2689
2690         if (setresgid(gid, gid, gid) < 0)
2691                 return log_error_errno(errno, "setregid() failed: %m");
2692
2693         if (setresuid(uid, uid, uid) < 0)
2694                 return log_error_errno(errno, "setreuid() failed: %m");
2695
2696         if (_home) {
2697                 *_home = home;
2698                 home = NULL;
2699         }
2700
2701         return 0;
2702 }
2703
2704 /*
2705  * Return values:
2706  * < 0 : wait_for_terminate() failed to get the state of the
2707  *       container, the container was terminated by a signal, or
2708  *       failed for an unknown reason.  No change is made to the
2709  *       container argument.
2710  * > 0 : The program executed in the container terminated with an
2711  *       error.  The exit code of the program executed in the
2712  *       container is returned.  The container argument has been set
2713  *       to CONTAINER_TERMINATED.
2714  *   0 : The container is being rebooted, has been shut down or exited
2715  *       successfully.  The container argument has been set to either
2716  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2717  *
2718  * That is, success is indicated by a return value of zero, and an
2719  * error is indicated by a non-zero value.
2720  */
2721 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2722         siginfo_t status;
2723         int r;
2724
2725         r = wait_for_terminate(pid, &status);
2726         if (r < 0)
2727                 return log_warning_errno(r, "Failed to wait for container: %m");
2728
2729         switch (status.si_code) {
2730
2731         case CLD_EXITED:
2732                 if (status.si_status == 0) {
2733                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2734
2735                 } else
2736                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2737
2738                 *container = CONTAINER_TERMINATED;
2739                 return status.si_status;
2740
2741         case CLD_KILLED:
2742                 if (status.si_status == SIGINT) {
2743
2744                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2745                         *container = CONTAINER_TERMINATED;
2746                         return 0;
2747
2748                 } else if (status.si_status == SIGHUP) {
2749
2750                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2751                         *container = CONTAINER_REBOOTED;
2752                         return 0;
2753                 }
2754
2755                 /* CLD_KILLED fallthrough */
2756
2757         case CLD_DUMPED:
2758                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2759                 return -EIO;
2760
2761         default:
2762                 log_error("Container %s failed due to unknown reason.", arg_machine);
2763                 return -EIO;
2764         }
2765
2766         return r;
2767 }
2768
/* Signal handler that deliberately does nothing. main() installs it
 * for SIGCHLD. */
static void nop_handler(int sig) {
        (void) sig;
}
2770
2771 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2772         pid_t pid;
2773
2774         pid = PTR_TO_UINT32(userdata);
2775         if (pid > 0) {
2776                 if (kill(pid, SIGRTMIN+3) >= 0) {
2777                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2778                         sd_event_source_set_userdata(s, NULL);
2779                         return 0;
2780                 }
2781         }
2782
2783         sd_event_exit(sd_event_source_get_event(s), 0);
2784         return 0;
2785 }
2786
2787 int main(int argc, char *argv[]) {
2788
2789         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2790         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2791         _cleanup_close_ int master = -1, image_fd = -1;
2792         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2793         _cleanup_fdset_free_ FDSet *fds = NULL;
2794         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2795         const char *console = NULL;
2796         char veth_name[IFNAMSIZ];
2797         bool secondary = false;
2798         sigset_t mask, mask_chld;
2799         pid_t pid = 0;
2800
2801         log_parse_environment();
2802         log_open();
2803
2804         k = parse_argv(argc, argv);
2805         if (k < 0)
2806                 goto finish;
2807         else if (k == 0) {
2808                 r = EXIT_SUCCESS;
2809                 goto finish;
2810         }
2811
2812         if (!arg_image) {
2813                 if (arg_directory) {
2814                         char *p;
2815
2816                         p = path_make_absolute_cwd(arg_directory);
2817                         free(arg_directory);
2818                         arg_directory = p;
2819                 } else
2820                         arg_directory = get_current_dir_name();
2821
2822                 if (!arg_directory) {
2823                         log_error("Failed to determine path, please use -D.");
2824                         goto finish;
2825                 }
2826                 path_kill_slashes(arg_directory);
2827         }
2828
2829         if (!arg_machine) {
2830                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2831                 if (!arg_machine) {
2832                         log_oom();
2833                         goto finish;
2834                 }
2835
2836                 hostname_cleanup(arg_machine, false);
2837                 if (isempty(arg_machine)) {
2838                         log_error("Failed to determine machine name automatically, please use -M.");
2839                         goto finish;
2840                 }
2841         }
2842
2843         if (geteuid() != 0) {
2844                 log_error("Need to be root.");
2845                 goto finish;
2846         }
2847
2848         if (sd_booted() <= 0) {
2849                 log_error("Not running on a systemd system.");
2850                 goto finish;
2851         }
2852
2853         log_close();
2854         n_fd_passed = sd_listen_fds(false);
2855         if (n_fd_passed > 0) {
2856                 k = fdset_new_listen_fds(&fds, false);
2857                 if (k < 0) {
2858                         log_error_errno(k, "Failed to collect file descriptors: %m");
2859                         goto finish;
2860                 }
2861         }
2862         fdset_close_others(fds);
2863         log_open();
2864
2865         if (arg_directory) {
2866                 if (path_equal(arg_directory, "/")) {
2867                         log_error("Spawning container on root directory not supported.");
2868                         goto finish;
2869                 }
2870
2871                 if (arg_boot) {
2872                         if (path_is_os_tree(arg_directory) <= 0) {
2873                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2874                                 goto finish;
2875                         }
2876                 } else {
2877                         const char *p;
2878
2879                         p = strappenda(arg_directory,
2880                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2881                         if (access(p, F_OK) < 0) {
2882                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2883                                 goto finish;
2884
2885                         }
2886                 }
2887         } else {
2888                 char template[] = "/tmp/nspawn-root-XXXXXX";
2889
2890                 if (!mkdtemp(template)) {
2891                         log_error_errno(errno, "Failed to create temporary directory: %m");
2892                         r = -errno;
2893                         goto finish;
2894                 }
2895
2896                 arg_directory = strdup(template);
2897                 if (!arg_directory) {
2898                         r = log_oom();
2899                         goto finish;
2900                 }
2901
2902                 image_fd = setup_image(&device_path, &loop_nr);
2903                 if (image_fd < 0) {
2904                         r = image_fd;
2905                         goto finish;
2906                 }
2907
2908                 r = dissect_image(image_fd,
2909                                   &root_device, &root_device_rw,
2910                                   &home_device, &home_device_rw,
2911                                   &srv_device, &srv_device_rw,
2912                                   &secondary);
2913                 if (r < 0)
2914                         goto finish;
2915         }
2916
2917         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2918         if (master < 0) {
2919                 log_error_errno(errno, "Failed to acquire pseudo tty: %m");
2920                 goto finish;
2921         }
2922
2923         console = ptsname(master);
2924         if (!console) {
2925                 log_error_errno(errno, "Failed to determine tty name: %m");
2926                 goto finish;
2927         }
2928
2929         if (!arg_quiet)
2930                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2931                          arg_machine, arg_image ? arg_image : arg_directory);
2932
2933         if (unlockpt(master) < 0) {
2934                 log_error_errno(errno, "Failed to unlock tty: %m");
2935                 goto finish;
2936         }
2937
2938         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2939                 log_error_errno(errno, "Failed to create kmsg socket pair: %m");
2940                 goto finish;
2941         }
2942
2943         sd_notify(false,
2944                   "READY=1\n"
2945                   "STATUS=Container running.");
2946
2947         assert_se(sigemptyset(&mask) == 0);
2948         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2949         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2950
2951         assert_se(sigemptyset(&mask_chld) == 0);
2952         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
2953
2954         for (;;) {
2955                 ContainerStatus container_status;
2956                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
2957                 struct sigaction sa = {
2958                         .sa_handler = nop_handler,
2959                         .sa_flags = SA_NOCLDSTOP,
2960                 };
2961
2962                 r = barrier_create(&barrier);
2963                 if (r < 0) {
2964                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
2965                         goto finish;
2966                 }
2967
2968                 /* Child can be killed before execv(), so handle SIGCHLD
2969                  * in order to interrupt parent's blocking calls and
2970                  * give it a chance to call wait() and terminate. */
2971                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2972                 if (r < 0) {
2973                         log_error_errno(errno, "Failed to change the signal mask: %m");
2974                         goto finish;
2975                 }
2976
2977                 r = sigaction(SIGCHLD, &sa, NULL);
2978                 if (r < 0) {
2979                         log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
2980                         goto finish;
2981                 }
2982
2983                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
2984                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2985                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
2986                 if (pid < 0) {
2987                         if (errno == EINVAL)
2988                                 log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2989                         else
2990                                 log_error_errno(errno, "clone() failed: %m");
2991
2992                         r = pid;
2993                         goto finish;
2994                 }
2995
2996                 if (pid == 0) {
2997                         /* child */
2998                         _cleanup_free_ char *home = NULL;
2999                         unsigned n_env = 2;
3000                         const char *envp[] = {
3001                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3002                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3003                                 NULL, /* TERM */
3004                                 NULL, /* HOME */
3005                                 NULL, /* USER */
3006                                 NULL, /* LOGNAME */
3007                                 NULL, /* container_uuid */
3008                                 NULL, /* LISTEN_FDS */
3009                                 NULL, /* LISTEN_PID */
3010                                 NULL
3011                         };
3012                         char **env_use;
3013
3014                         barrier_set_role(&barrier, BARRIER_CHILD);
3015
3016                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3017                         if (envp[n_env])
3018                                 n_env ++;
3019
3020                         master = safe_close(master);
3021
3022                         close_nointr(STDIN_FILENO);
3023                         close_nointr(STDOUT_FILENO);
3024                         close_nointr(STDERR_FILENO);
3025
3026                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3027
3028                         reset_all_signal_handlers();
3029                         reset_signal_mask();
3030
3031                         k = open_terminal(console, O_RDWR);
3032                         if (k != STDIN_FILENO) {
3033                                 if (k >= 0) {
3034                                         safe_close(k);
3035                                         k = -EINVAL;
3036                                 }
3037
3038                                 log_error_errno(k, "Failed to open console: %m");
3039                                 _exit(EXIT_FAILURE);
3040                         }
3041
3042                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3043                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3044                                 log_error_errno(errno, "Failed to duplicate console: %m");
3045                                 _exit(EXIT_FAILURE);
3046                         }
3047
3048                         if (setsid() < 0) {
3049                                 log_error_errno(errno, "setsid() failed: %m");
3050                                 _exit(EXIT_FAILURE);
3051                         }
3052
3053                         if (reset_audit_loginuid() < 0)
3054                                 _exit(EXIT_FAILURE);
3055
3056                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3057                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3058                                 _exit(EXIT_FAILURE);
3059                         }
3060
3061                         /* Mark everything as slave, so that we still
3062                          * receive mounts from the real root, but don't
3063                          * propagate mounts to the real root. */
3064                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3065                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3066                                 _exit(EXIT_FAILURE);
3067                         }
3068
3069                         if (mount_devices(arg_directory,
3070                                           root_device, root_device_rw,
3071                                           home_device, home_device_rw,
3072                                           srv_device, srv_device_rw) < 0)
3073                                 _exit(EXIT_FAILURE);
3074
3075                         /* Turn directory into bind mount */
3076                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3077                                 log_error_errno(errno, "Failed to make bind mount: %m");
3078                                 _exit(EXIT_FAILURE);
3079                         }
3080
3081                         r = setup_volatile(arg_directory);
3082                         if (r < 0)
3083                                 _exit(EXIT_FAILURE);
3084
3085                         if (setup_volatile_state(arg_directory) < 0)
3086                                 _exit(EXIT_FAILURE);
3087
3088                         r = base_filesystem_create(arg_directory);
3089                         if (r < 0)
3090                                 _exit(EXIT_FAILURE);
3091
3092                         if (arg_read_only) {
3093                                 k = bind_remount_recursive(arg_directory, true);
3094                                 if (k < 0) {
3095                                         log_error_errno(k, "Failed to make tree read-only: %m");
3096                                         _exit(EXIT_FAILURE);
3097                                 }
3098                         }
3099
3100                         if (mount_all(arg_directory) < 0)
3101                                 _exit(EXIT_FAILURE);
3102
3103                         if (copy_devnodes(arg_directory) < 0)
3104                                 _exit(EXIT_FAILURE);
3105
3106                         if (setup_ptmx(arg_directory) < 0)
3107                                 _exit(EXIT_FAILURE);
3108
3109                         dev_setup(arg_directory);
3110
3111                         if (setup_seccomp() < 0)
3112                                 _exit(EXIT_FAILURE);
3113
3114                         if (setup_dev_console(arg_directory, console) < 0)
3115                                 _exit(EXIT_FAILURE);
3116
3117                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3118                                 _exit(EXIT_FAILURE);
3119
3120                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3121
3122                         if (setup_boot_id(arg_directory) < 0)
3123                                 _exit(EXIT_FAILURE);
3124
3125                         if (setup_timezone(arg_directory) < 0)
3126                                 _exit(EXIT_FAILURE);
3127
3128                         if (setup_resolv_conf(arg_directory) < 0)
3129                                 _exit(EXIT_FAILURE);
3130
3131                         if (setup_journal(arg_directory) < 0)
3132                                 _exit(EXIT_FAILURE);
3133
3134                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3135                                 _exit(EXIT_FAILURE);
3136
3137                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3138                                 _exit(EXIT_FAILURE);
3139
3140                         if (mount_tmpfs(arg_directory) < 0)
3141                                 _exit(EXIT_FAILURE);
3142
3143                         /* Tell the parent that we are ready, and that
3144                          * it can cgroupify us to that we lack access
3145                          * to certain devices and resources. */
3146                         (void)barrier_place(&barrier);
3147
3148                         if (chdir(arg_directory) < 0) {
3149                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3150                                 _exit(EXIT_FAILURE);
3151                         }
3152
3153                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3154                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3155                                 _exit(EXIT_FAILURE);
3156                         }
3157
3158                         if (chroot(".") < 0) {
3159                                 log_error_errno(errno, "chroot() failed: %m");
3160                                 _exit(EXIT_FAILURE);
3161                         }
3162
3163                         if (chdir("/") < 0) {
3164                                 log_error_errno(errno, "chdir() failed: %m");
3165                                 _exit(EXIT_FAILURE);
3166                         }
3167
3168                         umask(0022);
3169
3170                         if (arg_private_network)
3171                                 loopback_setup();
3172
3173                         if (drop_capabilities() < 0) {
3174                                 log_error_errno(errno, "drop_capabilities() failed: %m");
3175                                 _exit(EXIT_FAILURE);
3176                         }
3177
3178                         r = change_uid_gid(&home);
3179                         if (r < 0)
3180                                 _exit(EXIT_FAILURE);
3181
3182                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3183                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3184                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3185                                 log_oom();
3186                                 _exit(EXIT_FAILURE);
3187                         }
3188
3189                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3190                                 char as_uuid[37];
3191
3192                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3193                                         log_oom();
3194                                         _exit(EXIT_FAILURE);
3195                                 }
3196                         }
3197
3198                         if (fdset_size(fds) > 0) {
3199                                 k = fdset_cloexec(fds, false);
3200                                 if (k < 0) {
3201                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3202                                         _exit(EXIT_FAILURE);
3203                                 }
3204
3205                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3206                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3207                                         log_oom();
3208                                         _exit(EXIT_FAILURE);
3209                                 }
3210                         }
3211
3212                         setup_hostname();
3213
3214                         if (arg_personality != 0xffffffffLU) {
3215                                 if (personality(arg_personality) < 0) {
3216                                         log_error_errno(errno, "personality() failed: %m");
3217                                         _exit(EXIT_FAILURE);
3218                                 }
3219                         } else if (secondary) {
3220                                 if (personality(PER_LINUX32) < 0) {
3221                                         log_error_errno(errno, "personality() failed: %m");
3222                                         _exit(EXIT_FAILURE);
3223                                 }
3224                         }
3225
3226 #ifdef HAVE_SELINUX
3227                         if (arg_selinux_context)
3228                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3229                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3230                                         _exit(EXIT_FAILURE);
3231                                 }
3232 #endif
3233
3234                         if (!strv_isempty(arg_setenv)) {
3235                                 char **n;
3236
3237                                 n = strv_env_merge(2, envp, arg_setenv);
3238                                 if (!n) {
3239                                         log_oom();
3240                                         _exit(EXIT_FAILURE);
3241                                 }
3242
3243                                 env_use = n;
3244                         } else
3245                                 env_use = (char**) envp;
3246
3247                         /* Wait until the parent is ready with the setup, too... */
3248                         if (!barrier_place_and_sync(&barrier))
3249                                 _exit(EXIT_FAILURE);
3250
3251                         if (arg_boot) {
3252                                 char **a;
3253                                 size_t l;
3254
3255                                 /* Automatically search for the init system */
3256
3257                                 l = 1 + argc - optind;
3258                                 a = newa(char*, l + 1);
3259                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3260
3261                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3262                                 execve(a[0], a, env_use);
3263
3264                                 a[0] = (char*) "/lib/systemd/systemd";
3265                                 execve(a[0], a, env_use);
3266
3267                                 a[0] = (char*) "/sbin/init";
3268                                 execve(a[0], a, env_use);
3269                         } else if (argc > optind)
3270                                 execvpe(argv[optind], argv + optind, env_use);
3271                         else {
3272                                 chdir(home ? home : "/root");
3273                                 execle("/bin/bash", "-bash", NULL, env_use);
3274                                 execle("/bin/sh", "-sh", NULL, env_use);
3275                         }
3276
3277                         log_error_errno(errno, "execv() failed: %m");
3278                         _exit(EXIT_FAILURE);
3279                 }
3280
3281                 barrier_set_role(&barrier, BARRIER_PARENT);
3282                 fdset_free(fds);
3283                 fds = NULL;
3284
3285                 /* wait for child-setup to be done */
3286                 if (barrier_place_and_sync(&barrier)) {
3287                         _cleanup_event_unref_ sd_event *event = NULL;
3288                         _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3289                         int ifi = 0;
3290
3291                         r = move_network_interfaces(pid);
3292                         if (r < 0)
3293                                 goto finish;
3294
3295                         r = setup_veth(pid, veth_name, &ifi);
3296                         if (r < 0)
3297                                 goto finish;
3298
3299                         r = setup_bridge(veth_name, &ifi);
3300                         if (r < 0)
3301                                 goto finish;
3302
3303                         r = setup_macvlan(pid);
3304                         if (r < 0)
3305                                 goto finish;
3306
3307                         r = register_machine(pid, ifi);
3308                         if (r < 0)
3309                                 goto finish;
3310
3311                         /* Block SIGCHLD here, before notifying child.
3312                          * process_pty() will handle it with the other signals. */
3313                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3314                         if (r < 0)
3315                                 goto finish;
3316
3317                         /* Reset signal to default */
3318                         r = default_signals(SIGCHLD, -1);
3319                         if (r < 0)
3320                                 goto finish;
3321
3322                         /* Notify the child that the parent is ready with all
3323                          * its setup, and that the child can now hand over
3324                          * control to the code to run inside the container. */
3325                         (void)barrier_place(&barrier);
3326
3327                         r = sd_event_new(&event);
3328                         if (r < 0) {
3329                                 log_error_errno(r, "Failed to get default event source: %m");
3330                                 goto finish;
3331                         }
3332
3333                         if (arg_boot) {
3334                                 /* Try to kill the init system on SIGINT or SIGTERM */
3335                                 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3336                                 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3337                         } else {
3338                                 /* Immediately exit */
3339                                 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3340                                 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3341                         }
3342
3343                         /* simply exit on sigchld */
3344                         sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3345
3346                         r = pty_forward_new(event, master, &forward);
3347                         if (r < 0) {
3348                                 log_error_errno(r, "Failed to create PTY forwarder: %m");
3349                                 goto finish;
3350                         }
3351
3352                         r = sd_event_loop(event);
3353                         if (r < 0)
3354                                 return log_error_errno(r, "Failed to run event loop: %m");
3355
3356                         forward = pty_forward_free(forward);
3357
3358                         if (!arg_quiet)
3359                                 putc('\n', stdout);
3360
3361                         /* Kill if it is not dead yet anyway */
3362                         terminate_machine(pid);
3363                 }
3364
3365                 /* Normally redundant, but better safe than sorry */
3366                 kill(pid, SIGKILL);
3367
3368                 r = wait_for_container(pid, &container_status);
3369                 pid = 0;
3370
3371                 if (r < 0) {
3372                         /* We failed to wait for the container, or the
3373                          * container exited abnormally */
3374                         r = EXIT_FAILURE;
3375                         break;
3376                 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3377                         /* The container exited with a non-zero
3378                          * status, or with zero status and no reboot
3379                          * was requested. */
3380                         break;
3381
3382                 /* CONTAINER_REBOOTED, loop again */
3383
3384                 if (arg_keep_unit) {
3385                         /* Special handling if we are running as a
3386                          * service: instead of simply restarting the
3387                          * machine we want to restart the entire
3388                          * service, so let's inform systemd about this
3389                          * with the special exit code 133. The service
3390                          * file uses RestartForceExitStatus=133 so
3391                          * that this results in a full nspawn
3392                          * restart. This is necessary since we might
3393                          * have cgroup parameters set we want to have
3394                          * flushed out. */
3395                         r = 133;
3396                         break;
3397                 }
3398         }
3399
3400 finish:
3401         sd_notify(false,
3402                   "STOPPING=1\n"
3403                   "STATUS=Terminating...");
3404
3405         loop_remove(loop_nr, &image_fd);
3406
3407         if (pid > 0)
3408                 kill(pid, SIGKILL);
3409
3410         free(arg_directory);
3411         free(arg_machine);
3412         free(arg_user);
3413         strv_free(arg_setenv);
3414         strv_free(arg_network_interfaces);
3415         strv_free(arg_network_macvlan);
3416         strv_free(arg_bind);
3417         strv_free(arg_bind_ro);
3418         strv_free(arg_tmpfs);
3419
3420         return r;
3421 }