chiark / gitweb /
932696aa9e1fae860c930f7a7ef656003774cb78
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92 #include "event-util.h"
93
94 #ifdef HAVE_SECCOMP
95 #include "seccomp-util.h"
96 #endif
97
/* Outcome of a container run. NOTE(review): nothing in the visible part of
 * this file uses these values; judging by the names they tell "payload
 * exited" apart from "payload asked for a reboot" -- confirm at the use site. */
98 typedef enum ContainerStatus {
99         CONTAINER_TERMINATED,
100         CONTAINER_REBOOTED
101 } ContainerStatus;
102
/* Journal linking mode chosen with --link-journal= (or -j).
 * parse_argv() maps "no", "auto", "host" and "guest" to the values below;
 * "try-host"/"try-guest" (and -j, which equals try-guest) select LINK_HOST or
 * LINK_GUEST and additionally set arg_link_journal_try. */
103 typedef enum LinkJournal {
104         LINK_NO,
105         LINK_AUTO,
106         LINK_HOST,
107         LINK_GUEST
108 } LinkJournal;
109
/* Mode chosen with --volatile[=MODE].
 * parse_argv() stores VOLATILE_YES for a bare --volatile or a true boolean,
 * VOLATILE_NO for a false boolean, and VOLATILE_STATE for the literal
 * argument "state". */
110 typedef enum Volatile {
111         VOLATILE_NO,
112         VOLATILE_YES,
113         VOLATILE_STATE,
114 } Volatile;
115
/* Command-line settings. The initializers are the defaults; parse_argv()
 * overwrites them while processing options.
 *
 * Ownership: the non-const "char *" values (arg_directory, arg_user,
 * arg_machine) are heap copies made by parse_argv() and are free()d there
 * before being replaced. The "const char *" values are assigned straight
 * from optarg and therefore are not owned. */
116 static char *arg_directory = NULL;
117 static char *arg_user = NULL;
118 static sd_id128_t arg_uuid = {};
119 static char *arg_machine = NULL;
120 static const char *arg_selinux_context = NULL;
121 static const char *arg_selinux_apifs_context = NULL;
122 static const char *arg_slice = NULL;
123 static bool arg_private_network = false;
124 static bool arg_read_only = false;
125 static bool arg_boot = false;
126 static LinkJournal arg_link_journal = LINK_AUTO;
127 static bool arg_link_journal_try = false;
/* Bit mask of capabilities (bit n == capability number n). This is the
 * default set; at the end of parse_argv() the --capability= bits are OR-ed
 * in, CAP_NET_ADMIN is added when a private network was requested, and the
 * --drop-capability= bits are then masked out. */
128 static uint64_t arg_retain =
129         (1ULL << CAP_CHOWN) |
130         (1ULL << CAP_DAC_OVERRIDE) |
131         (1ULL << CAP_DAC_READ_SEARCH) |
132         (1ULL << CAP_FOWNER) |
133         (1ULL << CAP_FSETID) |
134         (1ULL << CAP_IPC_OWNER) |
135         (1ULL << CAP_KILL) |
136         (1ULL << CAP_LEASE) |
137         (1ULL << CAP_LINUX_IMMUTABLE) |
138         (1ULL << CAP_NET_BIND_SERVICE) |
139         (1ULL << CAP_NET_BROADCAST) |
140         (1ULL << CAP_NET_RAW) |
141         (1ULL << CAP_SETGID) |
142         (1ULL << CAP_SETFCAP) |
143         (1ULL << CAP_SETPCAP) |
144         (1ULL << CAP_SETUID) |
145         (1ULL << CAP_SYS_ADMIN) |
146         (1ULL << CAP_SYS_CHROOT) |
147         (1ULL << CAP_SYS_NICE) |
148         (1ULL << CAP_SYS_PTRACE) |
149         (1ULL << CAP_SYS_TTY_CONFIG) |
150         (1ULL << CAP_SYS_RESOURCE) |
151         (1ULL << CAP_SYS_BOOT) |
152         (1ULL << CAP_AUDIT_WRITE) |
153         (1ULL << CAP_AUDIT_CONTROL) |
154         (1ULL << CAP_MKNOD);
/* arg_bind / arg_bind_ro: flat string vectors holding consecutive
 * (source, destination) pairs, appended by --bind= / --bind-ro=.
 * arg_tmpfs: consecutive (path, mount options) pairs appended by --tmpfs=. */
155 static char **arg_bind = NULL;
156 static char **arg_bind_ro = NULL;
157 static char **arg_tmpfs = NULL;
158 static char **arg_setenv = NULL;
159 static bool arg_quiet = false;
160 static bool arg_share_system = false;
161 static bool arg_register = true;
162 static bool arg_keep_unit = false;
163 static char **arg_network_interfaces = NULL;
164 static char **arg_network_macvlan = NULL;
165 static bool arg_network_veth = false;
166 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is the same value parse_argv() treats as "unknown
 * personality" when returned by personality_from_string(), so the default
 * here presumably means "no personality requested" -- confirm at the use
 * site. */
167 static unsigned long arg_personality = 0xffffffffLU;
168 static const char *arg_image = NULL;
169 static Volatile arg_volatile = VOLATILE_NO;
170
171 static void help(void) {
172         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
173                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
174                "  -h --help                 Show this help\n"
175                "     --version              Print version string\n"
176                "  -q --quiet                Do not show status information\n"
177                "  -D --directory=PATH       Root directory for the container\n"
178                "  -i --image=PATH           File system device or image for the container\n"
179                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
180                "  -u --user=USER            Run the command under specified user or uid\n"
181                "  -M --machine=NAME         Set the machine name for the container\n"
182                "     --uuid=UUID            Set a specific machine UUID for the container\n"
183                "  -S --slice=SLICE          Place the container in the specified slice\n"
184                "     --private-network      Disable network in container\n"
185                "     --network-interface=INTERFACE\n"
186                "                            Assign an existing network interface to the\n"
187                "                            container\n"
188                "     --network-macvlan=INTERFACE\n"
189                "                            Create a macvlan network interface based on an\n"
190                "                            existing network interface to the container\n"
191                "     --network-veth         Add a virtual ethernet connection between host\n"
192                "                            and container\n"
193                "     --network-bridge=INTERFACE\n"
194                "                            Add a virtual ethernet connection between host\n"
195                "                            and container and add it to an existing bridge on\n"
196                "                            the host\n"
197                "  -Z --selinux-context=SECLABEL\n"
198                "                            Set the SELinux security context to be used by\n"
199                "                            processes in the container\n"
200                "  -L --selinux-apifs-context=SECLABEL\n"
201                "                            Set the SELinux security context to be used by\n"
202                "                            API/tmpfs file systems in the container\n"
203                "     --capability=CAP       In addition to the default, retain specified\n"
204                "                            capability\n"
205                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
206                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
207                "                            try-guest, try-host\n"
208                "  -j                        Equivalent to --link-journal=try-guest\n"
209                "     --read-only            Mount the root directory read-only\n"
210                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
211                "                            the container\n"
212                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
213                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
214                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
215                "     --share-system         Share system namespaces with host\n"
216                "     --register=BOOLEAN     Register container as machine\n"
217                "     --keep-unit            Do not register a scope for the machine, reuse\n"
218                "                            the service unit nspawn is running in\n"
219                "     --volatile[=MODE]      Run the system in volatile mode\n",
220                program_invocation_short_name);
221 }
222
223 static int parse_argv(int argc, char *argv[]) {
224
225         enum {
226                 ARG_VERSION = 0x100,
227                 ARG_PRIVATE_NETWORK,
228                 ARG_UUID,
229                 ARG_READ_ONLY,
230                 ARG_CAPABILITY,
231                 ARG_DROP_CAPABILITY,
232                 ARG_LINK_JOURNAL,
233                 ARG_BIND,
234                 ARG_BIND_RO,
235                 ARG_TMPFS,
236                 ARG_SETENV,
237                 ARG_SHARE_SYSTEM,
238                 ARG_REGISTER,
239                 ARG_KEEP_UNIT,
240                 ARG_NETWORK_INTERFACE,
241                 ARG_NETWORK_MACVLAN,
242                 ARG_NETWORK_VETH,
243                 ARG_NETWORK_BRIDGE,
244                 ARG_PERSONALITY,
245                 ARG_VOLATILE,
246         };
247
248         static const struct option options[] = {
249                 { "help",                  no_argument,       NULL, 'h'                   },
250                 { "version",               no_argument,       NULL, ARG_VERSION           },
251                 { "directory",             required_argument, NULL, 'D'                   },
252                 { "user",                  required_argument, NULL, 'u'                   },
253                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
254                 { "boot",                  no_argument,       NULL, 'b'                   },
255                 { "uuid",                  required_argument, NULL, ARG_UUID              },
256                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
257                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
258                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
259                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
260                 { "bind",                  required_argument, NULL, ARG_BIND              },
261                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
262                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
263                 { "machine",               required_argument, NULL, 'M'                   },
264                 { "slice",                 required_argument, NULL, 'S'                   },
265                 { "setenv",                required_argument, NULL, ARG_SETENV            },
266                 { "selinux-context",       required_argument, NULL, 'Z'                   },
267                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
268                 { "quiet",                 no_argument,       NULL, 'q'                   },
269                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
270                 { "register",              required_argument, NULL, ARG_REGISTER          },
271                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
272                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
273                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
274                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
275                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
276                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
277                 { "image",                 required_argument, NULL, 'i'                   },
278                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
279                 {}
280         };
281
282         int c, r;
283         uint64_t plus = 0, minus = 0;
284
285         assert(argc >= 0);
286         assert(argv);
287
288         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
289
290                 switch (c) {
291
292                 case 'h':
293                         help();
294                         return 0;
295
296                 case ARG_VERSION:
297                         puts(PACKAGE_STRING);
298                         puts(SYSTEMD_FEATURES);
299                         return 0;
300
301                 case 'D':
302                         free(arg_directory);
303                         arg_directory = canonicalize_file_name(optarg);
304                         if (!arg_directory) {
305                                 log_error_errno(errno, "Invalid root directory: %m");
306                                 return -ENOMEM;
307                         }
308
309                         break;
310
311                 case 'i':
312                         arg_image = optarg;
313                         break;
314
315                 case 'u':
316                         free(arg_user);
317                         arg_user = strdup(optarg);
318                         if (!arg_user)
319                                 return log_oom();
320
321                         break;
322
323                 case ARG_NETWORK_BRIDGE:
324                         arg_network_bridge = optarg;
325
326                         /* fall through */
327
328                 case ARG_NETWORK_VETH:
329                         arg_network_veth = true;
330                         arg_private_network = true;
331                         break;
332
333                 case ARG_NETWORK_INTERFACE:
334                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
335                                 return log_oom();
336
337                         arg_private_network = true;
338                         break;
339
340                 case ARG_NETWORK_MACVLAN:
341                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
342                                 return log_oom();
343
344                         /* fall through */
345
346                 case ARG_PRIVATE_NETWORK:
347                         arg_private_network = true;
348                         break;
349
350                 case 'b':
351                         arg_boot = true;
352                         break;
353
354                 case ARG_UUID:
355                         r = sd_id128_from_string(optarg, &arg_uuid);
356                         if (r < 0) {
357                                 log_error("Invalid UUID: %s", optarg);
358                                 return r;
359                         }
360                         break;
361
362                 case 'S':
363                         arg_slice = optarg;
364                         break;
365
366                 case 'M':
367                         if (isempty(optarg)) {
368                                 free(arg_machine);
369                                 arg_machine = NULL;
370                         } else {
371
372                                 if (!hostname_is_valid(optarg)) {
373                                         log_error("Invalid machine name: %s", optarg);
374                                         return -EINVAL;
375                                 }
376
377                                 free(arg_machine);
378                                 arg_machine = strdup(optarg);
379                                 if (!arg_machine)
380                                         return log_oom();
381
382                                 break;
383                         }
384
385                 case 'Z':
386                         arg_selinux_context = optarg;
387                         break;
388
389                 case 'L':
390                         arg_selinux_apifs_context = optarg;
391                         break;
392
393                 case ARG_READ_ONLY:
394                         arg_read_only = true;
395                         break;
396
397                 case ARG_CAPABILITY:
398                 case ARG_DROP_CAPABILITY: {
399                         const char *state, *word;
400                         size_t length;
401
402                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
403                                 _cleanup_free_ char *t;
404                                 cap_value_t cap;
405
406                                 t = strndup(word, length);
407                                 if (!t)
408                                         return log_oom();
409
410                                 if (streq(t, "all")) {
411                                         if (c == ARG_CAPABILITY)
412                                                 plus = (uint64_t) -1;
413                                         else
414                                                 minus = (uint64_t) -1;
415                                 } else {
416                                         if (cap_from_name(t, &cap) < 0) {
417                                                 log_error("Failed to parse capability %s.", t);
418                                                 return -EINVAL;
419                                         }
420
421                                         if (c == ARG_CAPABILITY)
422                                                 plus |= 1ULL << (uint64_t) cap;
423                                         else
424                                                 minus |= 1ULL << (uint64_t) cap;
425                                 }
426                         }
427
428                         break;
429                 }
430
431                 case 'j':
432                         arg_link_journal = LINK_GUEST;
433                         arg_link_journal_try = true;
434                         break;
435
436                 case ARG_LINK_JOURNAL:
437                         if (streq(optarg, "auto"))
438                                 arg_link_journal = LINK_AUTO;
439                         else if (streq(optarg, "no"))
440                                 arg_link_journal = LINK_NO;
441                         else if (streq(optarg, "guest"))
442                                 arg_link_journal = LINK_GUEST;
443                         else if (streq(optarg, "host"))
444                                 arg_link_journal = LINK_HOST;
445                         else if (streq(optarg, "try-guest")) {
446                                 arg_link_journal = LINK_GUEST;
447                                 arg_link_journal_try = true;
448                         } else if (streq(optarg, "try-host")) {
449                                 arg_link_journal = LINK_HOST;
450                                 arg_link_journal_try = true;
451                         } else {
452                                 log_error("Failed to parse link journal mode %s", optarg);
453                                 return -EINVAL;
454                         }
455
456                         break;
457
458                 case ARG_BIND:
459                 case ARG_BIND_RO: {
460                         _cleanup_free_ char *a = NULL, *b = NULL;
461                         char *e;
462                         char ***x;
463
464                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
465
466                         e = strchr(optarg, ':');
467                         if (e) {
468                                 a = strndup(optarg, e - optarg);
469                                 b = strdup(e + 1);
470                         } else {
471                                 a = strdup(optarg);
472                                 b = strdup(optarg);
473                         }
474
475                         if (!a || !b)
476                                 return log_oom();
477
478                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
479                                 log_error("Invalid bind mount specification: %s", optarg);
480                                 return -EINVAL;
481                         }
482
483                         r = strv_extend(x, a);
484                         if (r < 0)
485                                 return log_oom();
486
487                         r = strv_extend(x, b);
488                         if (r < 0)
489                                 return log_oom();
490
491                         break;
492                 }
493
494                 case ARG_TMPFS: {
495                         _cleanup_free_ char *a = NULL, *b = NULL;
496                         char *e;
497
498                         e = strchr(optarg, ':');
499                         if (e) {
500                                 a = strndup(optarg, e - optarg);
501                                 b = strdup(e + 1);
502                         } else {
503                                 a = strdup(optarg);
504                                 b = strdup("mode=0755");
505                         }
506
507                         if (!a || !b)
508                                 return log_oom();
509
510                         if (!path_is_absolute(a)) {
511                                 log_error("Invalid tmpfs specification: %s", optarg);
512                                 return -EINVAL;
513                         }
514
515                         r = strv_push(&arg_tmpfs, a);
516                         if (r < 0)
517                                 return log_oom();
518
519                         a = NULL;
520
521                         r = strv_push(&arg_tmpfs, b);
522                         if (r < 0)
523                                 return log_oom();
524
525                         b = NULL;
526
527                         break;
528                 }
529
530                 case ARG_SETENV: {
531                         char **n;
532
533                         if (!env_assignment_is_valid(optarg)) {
534                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
535                                 return -EINVAL;
536                         }
537
538                         n = strv_env_set(arg_setenv, optarg);
539                         if (!n)
540                                 return log_oom();
541
542                         strv_free(arg_setenv);
543                         arg_setenv = n;
544                         break;
545                 }
546
547                 case 'q':
548                         arg_quiet = true;
549                         break;
550
551                 case ARG_SHARE_SYSTEM:
552                         arg_share_system = true;
553                         break;
554
555                 case ARG_REGISTER:
556                         r = parse_boolean(optarg);
557                         if (r < 0) {
558                                 log_error("Failed to parse --register= argument: %s", optarg);
559                                 return r;
560                         }
561
562                         arg_register = r;
563                         break;
564
565                 case ARG_KEEP_UNIT:
566                         arg_keep_unit = true;
567                         break;
568
569                 case ARG_PERSONALITY:
570
571                         arg_personality = personality_from_string(optarg);
572                         if (arg_personality == 0xffffffffLU) {
573                                 log_error("Unknown or unsupported personality '%s'.", optarg);
574                                 return -EINVAL;
575                         }
576
577                         break;
578
579                 case ARG_VOLATILE:
580
581                         if (!optarg)
582                                 arg_volatile = VOLATILE_YES;
583                         else {
584                                 r = parse_boolean(optarg);
585                                 if (r < 0) {
586                                         if (streq(optarg, "state"))
587                                                 arg_volatile = VOLATILE_STATE;
588                                         else {
589                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
590                                                 return r;
591                                         }
592                                 } else
593                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
594                         }
595
596                         break;
597
598                 case '?':
599                         return -EINVAL;
600
601                 default:
602                         assert_not_reached("Unhandled option");
603                 }
604
605         if (arg_share_system)
606                 arg_register = false;
607
608         if (arg_boot && arg_share_system) {
609                 log_error("--boot and --share-system may not be combined.");
610                 return -EINVAL;
611         }
612
613         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
614                 log_error("--keep-unit may not be used when invoked from a user session.");
615                 return -EINVAL;
616         }
617
618         if (arg_directory && arg_image) {
619                 log_error("--directory= and --image= may not be combined.");
620                 return -EINVAL;
621         }
622
623         if (arg_volatile != VOLATILE_NO && arg_read_only) {
624                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
625                 return -EINVAL;
626         }
627
628         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
629
630         return 1;
631 }
632
633 static int mount_all(const char *dest) {
634
635         typedef struct MountPoint {
636                 const char *what;
637                 const char *where;
638                 const char *type;
639                 const char *options;
640                 unsigned long flags;
641                 bool fatal;
642         } MountPoint;
643
644         static const MountPoint mount_table[] = {
645                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
646                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
647                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
648                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
649                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
650                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
651                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
652                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
653 #ifdef HAVE_SELINUX
654                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
655                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
656 #endif
657         };
658
659         unsigned k;
660         int r = 0;
661
662         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
663                 _cleanup_free_ char *where = NULL;
664 #ifdef HAVE_SELINUX
665                 _cleanup_free_ char *options = NULL;
666 #endif
667                 const char *o;
668                 int t;
669
670                 where = strjoin(dest, "/", mount_table[k].where, NULL);
671                 if (!where)
672                         return log_oom();
673
674                 t = path_is_mount_point(where, true);
675                 if (t < 0) {
676                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
677
678                         if (r == 0)
679                                 r = t;
680
681                         continue;
682                 }
683
684                 /* Skip this entry if it is not a remount. */
685                 if (mount_table[k].what && t > 0)
686                         continue;
687
688                 t = mkdir_p(where, 0755);
689                 if (t < 0) {
690                         if (mount_table[k].fatal) {
691                                log_error_errno(t, "Failed to create directory %s: %m", where);
692
693                                 if (r == 0)
694                                         r = t;
695                         } else
696                                log_warning_errno(t, "Failed to create directory %s: %m", where);
697
698                         continue;
699                 }
700
701 #ifdef HAVE_SELINUX
702                 if (arg_selinux_apifs_context &&
703                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
704                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
705                         if (!options)
706                                 return log_oom();
707
708                         o = options;
709                 } else
710 #endif
711                         o = mount_table[k].options;
712
713
714                 if (mount(mount_table[k].what,
715                           where,
716                           mount_table[k].type,
717                           mount_table[k].flags,
718                           o) < 0) {
719
720                         if (mount_table[k].fatal) {
721                                 log_error_errno(errno, "mount(%s) failed: %m", where);
722
723                                 if (r == 0)
724                                         r = -errno;
725                         } else
726                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
727                 }
728         }
729
730         return r;
731 }
732
733 static int mount_binds(const char *dest, char **l, bool ro) {
734         char **x, **y;
735
736         STRV_FOREACH_PAIR(x, y, l) {
737                 _cleanup_free_ char *where = NULL;
738                 struct stat source_st, dest_st;
739                 int r;
740
741                 if (stat(*x, &source_st) < 0)
742                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
743
744                 where = strappend(dest, *y);
745                 if (!where)
746                         return log_oom();
747
748                 r = stat(where, &dest_st);
749                 if (r == 0) {
750                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
751                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
752                                 return -EINVAL;
753                         }
754                 } else if (errno == ENOENT) {
755                         r = mkdir_parents_label(where, 0755);
756                         if (r < 0)
757                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
758                 } else {
759                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
760                         return -errno;
761                 }
762
763                 /* Create the mount point, but be conservative -- refuse to create block
764                  * and char devices. */
765                 if (S_ISDIR(source_st.st_mode)) {
766                         r = mkdir_label(where, 0755);
767                         if (r < 0 && errno != EEXIST)
768                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
769                 } else if (S_ISFIFO(source_st.st_mode)) {
770                         r = mkfifo(where, 0644);
771                         if (r < 0 && errno != EEXIST)
772                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
773                 } else if (S_ISSOCK(source_st.st_mode)) {
774                         r = mknod(where, 0644 | S_IFSOCK, 0);
775                         if (r < 0 && errno != EEXIST)
776                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
777                 } else if (S_ISREG(source_st.st_mode)) {
778                         r = touch(where);
779                         if (r < 0)
780                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
781                 } else {
782                         log_error("Refusing to create mountpoint for file: %s", *x);
783                         return -ENOTSUP;
784                 }
785
786                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
787                         return log_error_errno(errno, "mount(%s) failed: %m", where);
788
789                 if (ro) {
790                         r = bind_remount_recursive(where, true);
791                         if (r < 0)
792                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
793                 }
794         }
795
796         return 0;
797 }
798
799 static int mount_tmpfs(const char *dest) {
800         char **i, **o;
801
802         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
803                 _cleanup_free_ char *where = NULL;
804                 int r;
805
806                 where = strappend(dest, *i);
807                 if (!where)
808                         return log_oom();
809
810                 r = mkdir_label(where, 0755);
811                 if (r < 0 && r != -EEXIST)
812                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
813
814                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
815                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
816         }
817
818         return 0;
819 }
820
821 static int setup_timezone(const char *dest) {
822         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
823         char *z, *y;
824         int r;
825
826         assert(dest);
827
828         /* Fix the timezone, if possible */
829         r = readlink_malloc("/etc/localtime", &p);
830         if (r < 0) {
831                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
832                 return 0;
833         }
834
835         z = path_startswith(p, "../usr/share/zoneinfo/");
836         if (!z)
837                 z = path_startswith(p, "/usr/share/zoneinfo/");
838         if (!z) {
839                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
840                 return 0;
841         }
842
843         where = strappend(dest, "/etc/localtime");
844         if (!where)
845                 return log_oom();
846
847         r = readlink_malloc(where, &q);
848         if (r >= 0) {
849                 y = path_startswith(q, "../usr/share/zoneinfo/");
850                 if (!y)
851                         y = path_startswith(q, "/usr/share/zoneinfo/");
852
853                 /* Already pointing to the right place? Then do nothing .. */
854                 if (y && streq(y, z))
855                         return 0;
856         }
857
858         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
859         if (!check)
860                 return log_oom();
861
862         if (access(check, F_OK) < 0) {
863                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
864                 return 0;
865         }
866
867         what = strappend("../usr/share/zoneinfo/", z);
868         if (!what)
869                 return log_oom();
870
871         r = mkdir_parents(where, 0755);
872         if (r < 0) {
873                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
874
875                 return 0;
876         }
877
878         r = unlink(where);
879         if (r < 0 && errno != ENOENT) {
880                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
881
882                 return 0;
883         }
884
885         if (symlink(what, where) < 0) {
886                 log_error_errno(errno, "Failed to correct timezone of container: %m");
887                 return 0;
888         }
889
890         return 0;
891 }
892
893 static int setup_resolv_conf(const char *dest) {
894         _cleanup_free_ char *where = NULL;
895         int r;
896
897         assert(dest);
898
899         if (arg_private_network)
900                 return 0;
901
902         /* Fix resolv.conf, if possible */
903         where = strappend(dest, "/etc/resolv.conf");
904         if (!where)
905                 return log_oom();
906
907         /* We don't really care for the results of this really. If it
908          * fails, it fails, but meh... */
909         r = mkdir_parents(where, 0755);
910         if (r < 0) {
911                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
912
913                 return 0;
914         }
915
916         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
917         if (r < 0) {
918                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
919
920                 return 0;
921         }
922
923         return 0;
924 }
925
926 static int setup_volatile_state(const char *directory) {
927         const char *p;
928         int r;
929
930         assert(directory);
931
932         if (arg_volatile != VOLATILE_STATE)
933                 return 0;
934
935         /* --volatile=state means we simply overmount /var
936            with a tmpfs, and the rest read-only. */
937
938         r = bind_remount_recursive(directory, true);
939         if (r < 0)
940                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
941
942         p = strappenda(directory, "/var");
943         r = mkdir(p, 0755);
944         if (r < 0 && errno != EEXIST)
945                 return log_error_errno(errno, "Failed to create %s: %m", directory);
946
947         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
948                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
949
950         return 0;
951 }
952
953 static int setup_volatile(const char *directory) {
954         bool tmpfs_mounted = false, bind_mounted = false;
955         char template[] = "/tmp/nspawn-volatile-XXXXXX";
956         const char *f, *t;
957         int r;
958
959         assert(directory);
960
961         if (arg_volatile != VOLATILE_YES)
962                 return 0;
963
964         /* --volatile=yes means we mount a tmpfs to the root dir, and
965            the original /usr to use inside it, and that read-only. */
966
967         if (!mkdtemp(template))
968                 return log_error_errno(errno, "Failed to create temporary directory: %m");
969
970         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
971                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
972                 r = -errno;
973                 goto fail;
974         }
975
976         tmpfs_mounted = true;
977
978         f = strappenda(directory, "/usr");
979         t = strappenda(template, "/usr");
980
981         r = mkdir(t, 0755);
982         if (r < 0 && errno != EEXIST) {
983                 log_error_errno(errno, "Failed to create %s: %m", t);
984                 r = -errno;
985                 goto fail;
986         }
987
988         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
989                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
990                 r = -errno;
991                 goto fail;
992         }
993
994         bind_mounted = true;
995
996         r = bind_remount_recursive(t, true);
997         if (r < 0) {
998                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
999                 goto fail;
1000         }
1001
1002         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1003                 log_error_errno(errno, "Failed to move root mount: %m");
1004                 r = -errno;
1005                 goto fail;
1006         }
1007
1008         rmdir(template);
1009
1010         return 0;
1011
1012 fail:
1013         if (bind_mounted)
1014                 umount(t);
1015         if (tmpfs_mounted)
1016                 umount(template);
1017         rmdir(template);
1018         return r;
1019 }
1020
1021 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1022
1023         snprintf(s, 37,
1024                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1025                  SD_ID128_FORMAT_VAL(id));
1026
1027         return s;
1028 }
1029
1030 static int setup_boot_id(const char *dest) {
1031         _cleanup_free_ char *from = NULL, *to = NULL;
1032         sd_id128_t rnd = {};
1033         char as_uuid[37];
1034         int r;
1035
1036         assert(dest);
1037
1038         if (arg_share_system)
1039                 return 0;
1040
1041         /* Generate a new randomized boot ID, so that each boot-up of
1042          * the container gets a new one */
1043
1044         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1045         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1046         if (!from || !to)
1047                 return log_oom();
1048
1049         r = sd_id128_randomize(&rnd);
1050         if (r < 0)
1051                 return log_error_errno(r, "Failed to generate random boot id: %m");
1052
1053         id128_format_as_uuid(rnd, as_uuid);
1054
1055         r = write_string_file(from, as_uuid);
1056         if (r < 0)
1057                 return log_error_errno(r, "Failed to write boot id: %m");
1058
1059         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1060                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1061                 r = -errno;
1062         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1063                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1064
1065         unlink(from);
1066         return r;
1067 }
1068
1069 static int copy_devnodes(const char *dest) {
1070
1071         static const char devnodes[] =
1072                 "null\0"
1073                 "zero\0"
1074                 "full\0"
1075                 "random\0"
1076                 "urandom\0"
1077                 "tty\0"
1078                 "net/tun\0";
1079
1080         const char *d;
1081         int r = 0;
1082         _cleanup_umask_ mode_t u;
1083
1084         assert(dest);
1085
1086         u = umask(0000);
1087
1088         NULSTR_FOREACH(d, devnodes) {
1089                 _cleanup_free_ char *from = NULL, *to = NULL;
1090                 struct stat st;
1091
1092                 from = strappend("/dev/", d);
1093                 to = strjoin(dest, "/dev/", d, NULL);
1094                 if (!from || !to)
1095                         return log_oom();
1096
1097                 if (stat(from, &st) < 0) {
1098
1099                         if (errno != ENOENT)
1100                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1101
1102                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1103
1104                         log_error("%s is not a char or block device, cannot copy", from);
1105                         return -EIO;
1106
1107                 } else {
1108                         r = mkdir_parents(to, 0775);
1109                         if (r < 0) {
1110                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1111                                 return -r;
1112                         }
1113
1114                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1115                                 return log_error_errno(errno, "mknod(%s) failed: %m", dest);
1116                 }
1117         }
1118
1119         return r;
1120 }
1121
1122 static int setup_ptmx(const char *dest) {
1123         _cleanup_free_ char *p = NULL;
1124
1125         p = strappend(dest, "/dev/ptmx");
1126         if (!p)
1127                 return log_oom();
1128
1129         if (symlink("pts/ptmx", p) < 0)
1130                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1131
1132         return 0;
1133 }
1134
1135 static int setup_dev_console(const char *dest, const char *console) {
1136         _cleanup_umask_ mode_t u;
1137         const char *to;
1138         struct stat st;
1139         int r;
1140
1141         assert(dest);
1142         assert(console);
1143
1144         u = umask(0000);
1145
1146         if (stat("/dev/null", &st) < 0)
1147                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1148
1149         r = chmod_and_chown(console, 0600, 0, 0);
1150         if (r < 0)
1151                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1152
1153         /* We need to bind mount the right tty to /dev/console since
1154          * ptys can only exist on pts file systems. To have something
1155          * to bind mount things on we create a device node first, and
1156          * use /dev/null for that since we the cgroups device policy
1157          * allows us to create that freely, while we cannot create
1158          * /dev/console. (Note that the major minor doesn't actually
1159          * matter here, since we mount it over anyway). */
1160
1161         to = strappenda(dest, "/dev/console");
1162         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1163                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1164
1165         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1166                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1167
1168         return 0;
1169 }
1170
1171 static int setup_kmsg(const char *dest, int kmsg_socket) {
1172         _cleanup_free_ char *from = NULL, *to = NULL;
1173         int r, fd, k;
1174         _cleanup_umask_ mode_t u;
1175         union {
1176                 struct cmsghdr cmsghdr;
1177                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1178         } control = {};
1179         struct msghdr mh = {
1180                 .msg_control = &control,
1181                 .msg_controllen = sizeof(control),
1182         };
1183         struct cmsghdr *cmsg;
1184
1185         assert(dest);
1186         assert(kmsg_socket >= 0);
1187
1188         u = umask(0000);
1189
1190         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1191          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1192          * on the reading side behave very similar to /proc/kmsg,
1193          * their writing side behaves differently from /dev/kmsg in
1194          * that writing blocks when nothing is reading. In order to
1195          * avoid any problems with containers deadlocking due to this
1196          * we simply make /dev/kmsg unavailable to the container. */
1197         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1198             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1199                 return log_oom();
1200
1201         if (mkfifo(from, 0600) < 0)
1202                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1203
1204         r = chmod_and_chown(from, 0600, 0, 0);
1205         if (r < 0)
1206                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1207
1208         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1209                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1210
1211         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1212         if (fd < 0)
1213                 return log_error_errno(errno, "Failed to open fifo: %m");
1214
1215         cmsg = CMSG_FIRSTHDR(&mh);
1216         cmsg->cmsg_level = SOL_SOCKET;
1217         cmsg->cmsg_type = SCM_RIGHTS;
1218         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1219         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1220
1221         mh.msg_controllen = cmsg->cmsg_len;
1222
1223         /* Store away the fd in the socket, so that it stays open as
1224          * long as we run the child */
1225         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1226         safe_close(fd);
1227
1228         if (k < 0)
1229                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1230
1231         /* And now make the FIFO unavailable as /dev/kmsg... */
1232         unlink(from);
1233         return 0;
1234 }
1235
1236 static int setup_hostname(void) {
1237
1238         if (arg_share_system)
1239                 return 0;
1240
1241         if (sethostname_idempotent(arg_machine) < 0)
1242                 return -errno;
1243
1244         return 0;
1245 }
1246
1247 static int setup_journal(const char *directory) {
1248         sd_id128_t machine_id, this_id;
1249         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1250         char *id;
1251         int r;
1252
1253         p = strappend(directory, "/etc/machine-id");
1254         if (!p)
1255                 return log_oom();
1256
1257         r = read_one_line_file(p, &b);
1258         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1259                 return 0;
1260         else if (r < 0)
1261                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1262
1263         id = strstrip(b);
1264         if (isempty(id) && arg_link_journal == LINK_AUTO)
1265                 return 0;
1266
1267         /* Verify validity */
1268         r = sd_id128_from_string(id, &machine_id);
1269         if (r < 0)
1270                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1271
1272         r = sd_id128_get_machine(&this_id);
1273         if (r < 0)
1274                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1275
1276         if (sd_id128_equal(machine_id, this_id)) {
1277                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1278                          "Host and machine ids are equal (%s): refusing to link journals", id);
1279                 if (arg_link_journal == LINK_AUTO)
1280                         return 0;
1281                 return
1282                         -EEXIST;
1283         }
1284
1285         if (arg_link_journal == LINK_NO)
1286                 return 0;
1287
1288         free(p);
1289         p = strappend("/var/log/journal/", id);
1290         q = strjoin(directory, "/var/log/journal/", id, NULL);
1291         if (!p || !q)
1292                 return log_oom();
1293
1294         if (path_is_mount_point(p, false) > 0) {
1295                 if (arg_link_journal != LINK_AUTO) {
1296                         log_error("%s: already a mount point, refusing to use for journal", p);
1297                         return -EEXIST;
1298                 }
1299
1300                 return 0;
1301         }
1302
1303         if (path_is_mount_point(q, false) > 0) {
1304                 if (arg_link_journal != LINK_AUTO) {
1305                         log_error("%s: already a mount point, refusing to use for journal", q);
1306                         return -EEXIST;
1307                 }
1308
1309                 return 0;
1310         }
1311
1312         r = readlink_and_make_absolute(p, &d);
1313         if (r >= 0) {
1314                 if ((arg_link_journal == LINK_GUEST ||
1315                      arg_link_journal == LINK_AUTO) &&
1316                     path_equal(d, q)) {
1317
1318                         r = mkdir_p(q, 0755);
1319                         if (r < 0)
1320                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1321                         return 0;
1322                 }
1323
1324                 if (unlink(p) < 0)
1325                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1326         } else if (r == -EINVAL) {
1327
1328                 if (arg_link_journal == LINK_GUEST &&
1329                     rmdir(p) < 0) {
1330
1331                         if (errno == ENOTDIR) {
1332                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1333                                 return r;
1334                         } else {
1335                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1336                                 return -errno;
1337                         }
1338                 }
1339         } else if (r != -ENOENT) {
1340                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1341                 return r;
1342         }
1343
1344         if (arg_link_journal == LINK_GUEST) {
1345
1346                 if (symlink(q, p) < 0) {
1347                         if (arg_link_journal_try) {
1348                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1349                                 return 0;
1350                         } else {
1351                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1352                                 return -errno;
1353                         }
1354                 }
1355
1356                 r = mkdir_p(q, 0755);
1357                 if (r < 0)
1358                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1359                 return 0;
1360         }
1361
1362         if (arg_link_journal == LINK_HOST) {
1363                 /* don't create parents here -- if the host doesn't have
1364                  * permanent journal set up, don't force it here */
1365                 r = mkdir(p, 0755);
1366                 if (r < 0) {
1367                         if (arg_link_journal_try) {
1368                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1369                                 return 0;
1370                         } else {
1371                                 log_error_errno(errno, "Failed to create %s: %m", p);
1372                                 return r;
1373                         }
1374                 }
1375
1376         } else if (access(p, F_OK) < 0)
1377                 return 0;
1378
1379         if (dir_is_empty(q) == 0)
1380                 log_warning("%s is not empty, proceeding anyway.", q);
1381
1382         r = mkdir_p(q, 0755);
1383         if (r < 0) {
1384                 log_error_errno(errno, "Failed to create %s: %m", q);
1385                 return r;
1386         }
1387
1388         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1389                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1390
1391         return 0;
1392 }
1393
1394 static int drop_capabilities(void) {
1395         return capability_bounding_set_drop(~arg_retain, false);
1396 }
1397
1398 static int register_machine(pid_t pid, int local_ifindex) {
1399         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1400         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1401         int r;
1402
1403         if (!arg_register)
1404                 return 0;
1405
1406         r = sd_bus_default_system(&bus);
1407         if (r < 0)
1408                 return log_error_errno(r, "Failed to open system bus: %m");
1409
1410         if (arg_keep_unit) {
1411                 r = sd_bus_call_method(
1412                                 bus,
1413                                 "org.freedesktop.machine1",
1414                                 "/org/freedesktop/machine1",
1415                                 "org.freedesktop.machine1.Manager",
1416                                 "RegisterMachineWithNetwork",
1417                                 &error,
1418                                 NULL,
1419                                 "sayssusai",
1420                                 arg_machine,
1421                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1422                                 "nspawn",
1423                                 "container",
1424                                 (uint32_t) pid,
1425                                 strempty(arg_directory),
1426                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1427         } else {
1428                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1429
1430                 r = sd_bus_message_new_method_call(
1431                                 bus,
1432                                 &m,
1433                                 "org.freedesktop.machine1",
1434                                 "/org/freedesktop/machine1",
1435                                 "org.freedesktop.machine1.Manager",
1436                                 "CreateMachineWithNetwork");
1437                 if (r < 0)
1438                         return log_error_errno(r, "Failed to create message: %m");
1439
1440                 r = sd_bus_message_append(
1441                                 m,
1442                                 "sayssusai",
1443                                 arg_machine,
1444                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1445                                 "nspawn",
1446                                 "container",
1447                                 (uint32_t) pid,
1448                                 strempty(arg_directory),
1449                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1450                 if (r < 0)
1451                         return log_error_errno(r, "Failed to append message arguments: %m");
1452
1453                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1454                 if (r < 0)
1455                         return log_error_errno(r, "Failed to open container: %m");
1456
1457                 if (!isempty(arg_slice)) {
1458                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1459                         if (r < 0)
1460                                 return log_error_errno(r, "Failed to append slice: %m");
1461                 }
1462
1463                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1464                 if (r < 0)
1465                         return log_error_errno(r, "Failed to add device policy: %m");
1466
1467                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1468                                           /* Allow the container to
1469                                            * access and create the API
1470                                            * device nodes, so that
1471                                            * PrivateDevices= in the
1472                                            * container can work
1473                                            * fine */
1474                                           "/dev/null", "rwm",
1475                                           "/dev/zero", "rwm",
1476                                           "/dev/full", "rwm",
1477                                           "/dev/random", "rwm",
1478                                           "/dev/urandom", "rwm",
1479                                           "/dev/tty", "rwm",
1480                                           "/dev/net/tun", "rwm",
1481                                           /* Allow the container
1482                                            * access to ptys. However,
1483                                            * do not permit the
1484                                            * container to ever create
1485                                            * these device nodes. */
1486                                           "/dev/pts/ptmx", "rw",
1487                                           "char-pts", "rw");
1488                 if (r < 0)
1489                         return log_error_errno(r, "Failed to add device whitelist: %m");
1490
1491                 r = sd_bus_message_close_container(m);
1492                 if (r < 0)
1493                         return log_error_errno(r, "Failed to close container: %m");
1494
1495                 r = sd_bus_call(bus, m, 0, &error, NULL);
1496         }
1497
1498         if (r < 0) {
1499                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1500                 return r;
1501         }
1502
1503         return 0;
1504 }
1505
1506 static int terminate_machine(pid_t pid) {
1507         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1508         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1509         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1510         const char *path;
1511         int r;
1512
1513         if (!arg_register)
1514                 return 0;
1515
1516         r = sd_bus_default_system(&bus);
1517         if (r < 0)
1518                 return log_error_errno(r, "Failed to open system bus: %m");
1519
1520         r = sd_bus_call_method(
1521                         bus,
1522                         "org.freedesktop.machine1",
1523                         "/org/freedesktop/machine1",
1524                         "org.freedesktop.machine1.Manager",
1525                         "GetMachineByPID",
1526                         &error,
1527                         &reply,
1528                         "u",
1529                         (uint32_t) pid);
1530         if (r < 0) {
1531                 /* Note that the machine might already have been
1532                  * cleaned up automatically, hence don't consider it a
1533                  * failure if we cannot get the machine object. */
1534                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1535                 return 0;
1536         }
1537
1538         r = sd_bus_message_read(reply, "o", &path);
1539         if (r < 0)
1540                 return bus_log_parse_error(r);
1541
1542         r = sd_bus_call_method(
1543                         bus,
1544                         "org.freedesktop.machine1",
1545                         path,
1546                         "org.freedesktop.machine1.Machine",
1547                         "Terminate",
1548                         &error,
1549                         NULL,
1550                         NULL);
1551         if (r < 0) {
1552                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1553                 return 0;
1554         }
1555
1556         return 0;
1557 }
1558
1559 static int reset_audit_loginuid(void) {
1560         _cleanup_free_ char *p = NULL;
1561         int r;
1562
1563         if (arg_share_system)
1564                 return 0;
1565
1566         r = read_one_line_file("/proc/self/loginuid", &p);
1567         if (r == -ENOENT)
1568                 return 0;
1569         if (r < 0)
1570                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1571
1572         /* Already reset? */
1573         if (streq(p, "4294967295"))
1574                 return 0;
1575
1576         r = write_string_file("/proc/self/loginuid", "4294967295");
1577         if (r < 0) {
1578                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1579                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1580                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1581                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1582                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1583
1584                 sleep(5);
1585         }
1586
1587         return 0;
1588 }
1589
1590 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1591 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1592 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
1593
1594 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
1595         uint8_t result[8];
1596         size_t l, sz;
1597         uint8_t *v, *i;
1598         int r;
1599
1600         l = strlen(arg_machine);
1601         sz = sizeof(sd_id128_t) + l;
1602         if (idx > 0)
1603                 sz += sizeof(idx);
1604
1605         v = alloca(sz);
1606
1607         /* fetch some persistent data unique to the host */
1608         r = sd_id128_get_machine((sd_id128_t*) v);
1609         if (r < 0)
1610                 return r;
1611
1612         /* combine with some data unique (on this host) to this
1613          * container instance */
1614         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
1615         if (idx > 0) {
1616                 idx = htole64(idx);
1617                 memcpy(i, &idx, sizeof(idx));
1618         }
1619
1620         /* Let's hash the host machine ID plus the container name. We
1621          * use a fixed, but originally randomly created hash key here. */
1622         siphash24(result, v, sz, hash_key.bytes);
1623
1624         assert_cc(ETH_ALEN <= sizeof(result));
1625         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1626
1627         /* see eth_random_addr in the kernel */
1628         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1629         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1630
1631         return 0;
1632 }
1633
1634 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1635         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1636         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1637         struct ether_addr mac_host, mac_container;
1638         int r, i;
1639
1640         if (!arg_private_network)
1641                 return 0;
1642
1643         if (!arg_network_veth)
1644                 return 0;
1645
1646         /* Use two different interface name prefixes depending whether
1647          * we are in bridge mode or not. */
1648         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1649                  arg_network_bridge ? "vb" : "ve", arg_machine);
1650
1651         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1652         if (r < 0)
1653                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
1654
1655         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1656         if (r < 0)
1657                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
1658
1659         r = sd_rtnl_open(&rtnl, 0);
1660         if (r < 0)
1661                 return log_error_errno(r, "Failed to connect to netlink: %m");
1662
1663         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1664         if (r < 0)
1665                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1666
1667         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1668         if (r < 0)
1669                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1670
1671         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1672         if (r < 0)
1673                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1674
1675         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1676         if (r < 0)
1677                 return log_error_errno(r, "Failed to open netlink container: %m");
1678
1679         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1680         if (r < 0)
1681                 return log_error_errno(r, "Failed to open netlink container: %m");
1682
1683         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1684         if (r < 0)
1685                 return log_error_errno(r, "Failed to open netlink container: %m");
1686
1687         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1688         if (r < 0)
1689                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1690
1691         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1692         if (r < 0)
1693                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1694
1695         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1696         if (r < 0)
1697                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1698
1699         r = sd_rtnl_message_close_container(m);
1700         if (r < 0)
1701                 return log_error_errno(r, "Failed to close netlink container: %m");
1702
1703         r = sd_rtnl_message_close_container(m);
1704         if (r < 0)
1705                 return log_error_errno(r, "Failed to close netlink container: %m");
1706
1707         r = sd_rtnl_message_close_container(m);
1708         if (r < 0)
1709                 return log_error_errno(r, "Failed to close netlink container: %m");
1710
1711         r = sd_rtnl_call(rtnl, m, 0, NULL);
1712         if (r < 0)
1713                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
1714
1715         i = (int) if_nametoindex(iface_name);
1716         if (i <= 0)
1717                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
1718
1719         *ifi = i;
1720
1721         return 0;
1722 }
1723
1724 static int setup_bridge(const char veth_name[], int *ifi) {
1725         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1726         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1727         int r, bridge;
1728
1729         if (!arg_private_network)
1730                 return 0;
1731
1732         if (!arg_network_veth)
1733                 return 0;
1734
1735         if (!arg_network_bridge)
1736                 return 0;
1737
1738         bridge = (int) if_nametoindex(arg_network_bridge);
1739         if (bridge <= 0)
1740                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
1741
1742         *ifi = bridge;
1743
1744         r = sd_rtnl_open(&rtnl, 0);
1745         if (r < 0)
1746                 return log_error_errno(r, "Failed to connect to netlink: %m");
1747
1748         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1749         if (r < 0)
1750                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1751
1752         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1753         if (r < 0)
1754                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
1755
1756         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1757         if (r < 0)
1758                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
1759
1760         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1761         if (r < 0)
1762                 return log_error_errno(r, "Failed to add netlink master field: %m");
1763
1764         r = sd_rtnl_call(rtnl, m, 0, NULL);
1765         if (r < 0)
1766                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
1767
1768         return 0;
1769 }
1770
1771 static int parse_interface(struct udev *udev, const char *name) {
1772         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1773         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1774         int ifi;
1775
1776         ifi = (int) if_nametoindex(name);
1777         if (ifi <= 0)
1778                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
1779
1780         sprintf(ifi_str, "n%i", ifi);
1781         d = udev_device_new_from_device_id(udev, ifi_str);
1782         if (!d)
1783                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
1784
1785         if (udev_device_get_is_initialized(d) <= 0) {
1786                 log_error("Network interface %s is not initialized yet.", name);
1787                 return -EBUSY;
1788         }
1789
1790         return ifi;
1791 }
1792
1793 static int move_network_interfaces(pid_t pid) {
1794         _cleanup_udev_unref_ struct udev *udev = NULL;
1795         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1796         char **i;
1797         int r;
1798
1799         if (!arg_private_network)
1800                 return 0;
1801
1802         if (strv_isempty(arg_network_interfaces))
1803                 return 0;
1804
1805         r = sd_rtnl_open(&rtnl, 0);
1806         if (r < 0)
1807                 return log_error_errno(r, "Failed to connect to netlink: %m");
1808
1809         udev = udev_new();
1810         if (!udev) {
1811                 log_error("Failed to connect to udev.");
1812                 return -ENOMEM;
1813         }
1814
1815         STRV_FOREACH(i, arg_network_interfaces) {
1816                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1817                 int ifi;
1818
1819                 ifi = parse_interface(udev, *i);
1820                 if (ifi < 0)
1821                         return ifi;
1822
1823                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1824                 if (r < 0)
1825                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1826
1827                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1828                 if (r < 0)
1829                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
1830
1831                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1832                 if (r < 0)
1833                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
1834         }
1835
1836         return 0;
1837 }
1838
1839 static int setup_macvlan(pid_t pid) {
1840         _cleanup_udev_unref_ struct udev *udev = NULL;
1841         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1842         unsigned idx = 0;
1843         char **i;
1844         int r;
1845
1846         if (!arg_private_network)
1847                 return 0;
1848
1849         if (strv_isempty(arg_network_macvlan))
1850                 return 0;
1851
1852         r = sd_rtnl_open(&rtnl, 0);
1853         if (r < 0)
1854                 return log_error_errno(r, "Failed to connect to netlink: %m");
1855
1856         udev = udev_new();
1857         if (!udev) {
1858                 log_error("Failed to connect to udev.");
1859                 return -ENOMEM;
1860         }
1861
1862         STRV_FOREACH(i, arg_network_macvlan) {
1863                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1864                 _cleanup_free_ char *n = NULL;
1865                 struct ether_addr mac;
1866                 int ifi;
1867
1868                 ifi = parse_interface(udev, *i);
1869                 if (ifi < 0)
1870                         return ifi;
1871
1872                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
1873                 if (r < 0)
1874                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
1875
1876                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1877                 if (r < 0)
1878                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1879
1880                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1881                 if (r < 0)
1882                         return log_error_errno(r, "Failed to add netlink interface index: %m");
1883
1884                 n = strappend("mv-", *i);
1885                 if (!n)
1886                         return log_oom();
1887
1888                 strshorten(n, IFNAMSIZ-1);
1889
1890                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1891                 if (r < 0)
1892                         return log_error_errno(r, "Failed to add netlink interface name: %m");
1893
1894                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1895                 if (r < 0)
1896                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
1897
1898                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1899                 if (r < 0)
1900                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
1901
1902                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1903                 if (r < 0)
1904                         return log_error_errno(r, "Failed to open netlink container: %m");
1905
1906                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1907                 if (r < 0)
1908                         return log_error_errno(r, "Failed to open netlink container: %m");
1909
1910                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1911                 if (r < 0)
1912                         return log_error_errno(r, "Failed to append macvlan mode: %m");
1913
1914                 r = sd_rtnl_message_close_container(m);
1915                 if (r < 0)
1916                         return log_error_errno(r, "Failed to close netlink container: %m");
1917
1918                 r = sd_rtnl_message_close_container(m);
1919                 if (r < 0)
1920                         return log_error_errno(r, "Failed to close netlink container: %m");
1921
1922                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1923                 if (r < 0)
1924                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
1925         }
1926
1927         return 0;
1928 }
1929
/* Installs the seccomp filter for the container payload.
 *
 * The filter allows everything by default, makes a fixed list of
 * syscalls fail with EPERM, and makes socket(AF_NETLINK, *,
 * NETLINK_AUDIT) fail with EAFNOSUPPORT. The filter is loaded with the
 * SCMP_FLTATR_CTL_NNP attribute set to 0.
 *
 * Returns 0 on success (always, when built without HAVE_SECCOMP), a
 * negative errno-style value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blocked_syscalls); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[n], 0);

                /* -EFAULT means the syscall is unknown; just skip it. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit does not work in containers, and much of the
           userspace audit hookup fails when run inside one. We do
           not care and simply prevent audit sockets from being
           created.

           socket(AF_NETLINK, *, NETLINK_AUDIT) will fail with
           EAFNOSUPPORT, which audit userspace takes as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The context is released on success and on failure alike. */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2009
2010 static int setup_image(char **device_path, int *loop_nr) {
2011         struct loop_info64 info = {
2012                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2013         };
2014         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2015         _cleanup_free_ char* loopdev = NULL;
2016         struct stat st;
2017         int r, nr;
2018
2019         assert(device_path);
2020         assert(loop_nr);
2021
2022         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2023         if (fd < 0)
2024                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2025
2026         if (fstat(fd, &st) < 0)
2027                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2028
2029         if (S_ISBLK(st.st_mode)) {
2030                 char *p;
2031
2032                 p = strdup(arg_image);
2033                 if (!p)
2034                         return log_oom();
2035
2036                 *device_path = p;
2037
2038                 *loop_nr = -1;
2039
2040                 r = fd;
2041                 fd = -1;
2042
2043                 return r;
2044         }
2045
2046         if (!S_ISREG(st.st_mode)) {
2047                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2048                 return -EINVAL;
2049         }
2050
2051         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2052         if (control < 0)
2053                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2054
2055         nr = ioctl(control, LOOP_CTL_GET_FREE);
2056         if (nr < 0)
2057                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2058
2059         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2060                 return log_oom();
2061
2062         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2063         if (loop < 0)
2064                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2065
2066         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2067                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2068
2069         if (arg_read_only)
2070                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2071
2072         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2073                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2074
2075         *device_path = loopdev;
2076         loopdev = NULL;
2077
2078         *loop_nr = nr;
2079
2080         r = loop;
2081         loop = -1;
2082
2083         return r;
2084 }
2085
/* Looks for the root, /home and /srv partitions on the GPT disk image
 * that fd refers to.
 *
 * The partition table is probed with libblkid and has to be GPT. The
 * partition devices of the disk are enumerated through udev and
 * matched by GPT type UUID; partitions flagged GPT_FLAG_NO_AUTO are
 * ignored. If several partitions share a type, the one with the
 * lowest partition number is used. A native root partition is
 * preferred over a secondary one.
 *
 * On success returns 0, stores newly allocated device node paths in
 * *root_device and, only if such partitions were found, *home_device
 * and *srv_device (the caller frees them), sets the matching *_rw
 * flag to false for partitions carrying GPT_FLAG_READ_ONLY, and sets
 * *secondary to whether the secondary root type was used. Returns a
 * negative errno-style value on failure; -ENOTSUP when built without
 * HAVE_BLKID. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_partno = -1, srv_partno = -1;
#ifdef GPT_ROOT_NATIVE
        int root_partno = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_partno = -1;
#endif

        _cleanup_free_ char *home_node = NULL, *root_node = NULL, *secondary_node = NULL, *srv_node = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerate = NULL;
        _cleanup_udev_device_unref_ struct udev_device *disk = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *head, *entry;
        bool home_rw = true, root_rw = true, secondary_rw = true, srv_rw = true;
        const char *table_type = NULL;
        blkid_partlist partitions;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        /* errno is cleared first so that we can tell whether the
         * failing call set it at all. */
        errno = 0;
        r = blkid_probe_set_device(probe, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        /* Only GPT carries the type UUIDs we match on below. */
        blkid_probe_lookup_value(probe, "PTTYPE", &table_type, NULL);
        if (!streq_ptr(table_type, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partitions = blkid_probe_get_partitions(probe);
        if (!partitions) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        /* Enumerate all udev devices that have the whole disk as
         * parent; this is how we get at the partition device nodes. */
        disk = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!disk)
                return log_oom();

        enumerate = udev_enumerate_new(udev);
        if (!enumerate)
                return log_oom();

        r = udev_enumerate_add_match_parent(enumerate, disk);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(enumerate);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        head = udev_enumerate_get_list_entry(enumerate);
        udev_list_entry_foreach(entry, head) {
                _cleanup_udev_device_unref_ struct udev_device *part_dev;
                const char *type_str, *node;
                unsigned long long flags;
                sd_id128_t type_uuid;
                blkid_partition part;
                dev_t devnum;
                int partno;

                errno = 0;
                part_dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(entry));
                if (!part_dev) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number ... */
                devnum = udev_device_get_devnum(part_dev);
                if (major(devnum) == 0)
                        continue;

                /* ... the whole disk itself ... */
                if (st.st_rdev == devnum)
                        continue;

                /* ... entries without a device node ... */
                node = udev_device_get_devnode(part_dev);
                if (!node)
                        continue;

                /* ... and everything blkid has no partition for. */
                part = blkid_partlist_devno_to_partition(partitions, devnum);
                if (!part)
                        continue;

                flags = blkid_partition_get_flags(part);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                partno = blkid_partition_get_partno(part);
                if (partno < 0)
                        continue;

                type_str = blkid_partition_get_type_string(part);
                if (!type_str)
                        continue;

                if (sd_id128_from_string(type_str, &type_uuid) < 0)
                        continue;

                /* For every type, an already recorded partition is
                 * only replaced by one with a lower number. */
                if (sd_id128_equal(type_uuid, GPT_HOME)) {

                        if (home_node && partno >= home_partno)
                                continue;

                        home_partno = partno;
                        home_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(home_node);
                        home_node = strdup(node);
                        if (!home_node)
                                return log_oom();
                } else if (sd_id128_equal(type_uuid, GPT_SRV)) {

                        if (srv_node && partno >= srv_partno)
                                continue;

                        srv_partno = partno;
                        srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(srv_node);
                        srv_node = strdup(node);
                        if (!srv_node)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_uuid, GPT_ROOT_NATIVE)) {

                        if (root_node && partno >= root_partno)
                                continue;

                        root_partno = partno;
                        root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(root_node);
                        root_node = strdup(node);
                        if (!root_node)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_uuid, GPT_ROOT_SECONDARY)) {

                        if (secondary_node && partno >= secondary_partno)
                                continue;

                        secondary_partno = partno;
                        secondary_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(secondary_node);
                        secondary_node = strdup(node);
                        if (!secondary_node)
                                return log_oom();
                }
#endif
        }

        /* A root partition of either kind is mandatory. */
        if (!root_node && !secondary_node) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Hand the results over; setting our pointers to NULL keeps
         * the automatic cleanup from freeing what the caller now
         * owns. The native root wins over the secondary one. */
        if (root_node) {
                *root_device = root_node;
                root_node = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_node) {
                *root_device = secondary_node;
                secondary_node = NULL;

                *root_device_rw = secondary_rw;
                *secondary = true;
        }

        if (home_node) {
                *home_device = home_node;
                home_node = NULL;

                *home_device_rw = home_rw;
        }

        if (srv_node) {
                *srv_device = srv_node;
                srv_node = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2338
/* Mounts the block device what on the directory where, or on where
 * with directory appended if directory is non-NULL.
 *
 * The file system type is determined with libblkid. LUKS volumes are
 * refused with -ENOTSUP. The mount is made with MS_NODEV, and with
 * MS_RDONLY unless rw is true and arg_read_only is unset.
 *
 * Returns 0 on success, a negative errno-style value on failure;
 * -ENOTSUP when built without HAVE_BLKID. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-device flag. */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* errno is cleared first so that we can tell whether the
         * failing call set it at all. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 1 if nothing was detected and
         * -2 if the result is ambivalent; both mean that there is no
         * usable file system type. -1 is a real probing error and is
         * handled by the generic branch below. This is the same check
         * dissect_image() does. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2402
/* Mounts the partitions of a dissected disk image below 'where': the
 * root device on 'where' itself, the home device on 'where' + "/home"
 * and the server data device on 'where' + "/srv". Devices passed as
 * NULL are skipped. Each *_rw flag is handed through to mount_device().
 *
 * Returns 0 on success, or the negative error of the first mount that
 * failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount below the directory the caller asked for instead of
         * silently relying on the global arg_directory. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
2432
2433 static void loop_remove(int nr, int *image_fd) {
2434         _cleanup_close_ int control = -1;
2435         int r;
2436
2437         if (nr < 0)
2438                 return;
2439
2440         if (image_fd && *image_fd >= 0) {
2441                 r = ioctl(*image_fd, LOOP_CLR_FD);
2442                 if (r < 0)
2443                         log_warning_errno(errno, "Failed to close loop image: %m");
2444                 *image_fd = safe_close(*image_fd);
2445         }
2446
2447         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2448         if (control < 0) {
2449                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2450                 return;
2451         }
2452
2453         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2454         if (r < 0)
2455                 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
2456 }
2457
2458 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2459         int pipe_fds[2];
2460         pid_t pid;
2461
2462         assert(database);
2463         assert(key);
2464         assert(rpid);
2465
2466         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2467                 return log_error_errno(errno, "Failed to allocate pipe: %m");
2468
2469         pid = fork();
2470         if (pid < 0)
2471                 return log_error_errno(errno, "Failed to fork getent child: %m");
2472         else if (pid == 0) {
2473                 int nullfd;
2474                 char *empty_env = NULL;
2475
2476                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2477                         _exit(EXIT_FAILURE);
2478
2479                 if (pipe_fds[0] > 2)
2480                         safe_close(pipe_fds[0]);
2481                 if (pipe_fds[1] > 2)
2482                         safe_close(pipe_fds[1]);
2483
2484                 nullfd = open("/dev/null", O_RDWR);
2485                 if (nullfd < 0)
2486                         _exit(EXIT_FAILURE);
2487
2488                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2489                         _exit(EXIT_FAILURE);
2490
2491                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2492                         _exit(EXIT_FAILURE);
2493
2494                 if (nullfd > 2)
2495                         safe_close(nullfd);
2496
2497                 reset_all_signal_handlers();
2498                 close_all_fds(NULL, 0);
2499
2500                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2501                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2502                 _exit(EXIT_FAILURE);
2503         }
2504
2505         pipe_fds[1] = safe_close(pipe_fds[1]);
2506
2507         *rpid = pid;
2508
2509         return pipe_fds[0];
2510 }
2511
2512 static int change_uid_gid(char **_home) {
2513         char line[LINE_MAX], *x, *u, *g, *h;
2514         const char *word, *state;
2515         _cleanup_free_ uid_t *uids = NULL;
2516         _cleanup_free_ char *home = NULL;
2517         _cleanup_fclose_ FILE *f = NULL;
2518         _cleanup_close_ int fd = -1;
2519         unsigned n_uids = 0;
2520         size_t sz = 0, l;
2521         uid_t uid;
2522         gid_t gid;
2523         pid_t pid;
2524         int r;
2525
2526         assert(_home);
2527
2528         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2529                 /* Reset everything fully to 0, just in case */
2530
2531                 if (setgroups(0, NULL) < 0)
2532                         return log_error_errno(errno, "setgroups() failed: %m");
2533
2534                 if (setresgid(0, 0, 0) < 0)
2535                         return log_error_errno(errno, "setregid() failed: %m");
2536
2537                 if (setresuid(0, 0, 0) < 0)
2538                         return log_error_errno(errno, "setreuid() failed: %m");
2539
2540                 *_home = NULL;
2541                 return 0;
2542         }
2543
2544         /* First, get user credentials */
2545         fd = spawn_getent("passwd", arg_user, &pid);
2546         if (fd < 0)
2547                 return fd;
2548
2549         f = fdopen(fd, "r");
2550         if (!f)
2551                 return log_oom();
2552         fd = -1;
2553
2554         if (!fgets(line, sizeof(line), f)) {
2555
2556                 if (!ferror(f)) {
2557                         log_error("Failed to resolve user %s.", arg_user);
2558                         return -ESRCH;
2559                 }
2560
2561                 log_error_errno(errno, "Failed to read from getent: %m");
2562                 return -errno;
2563         }
2564
2565         truncate_nl(line);
2566
2567         wait_for_terminate_and_warn("getent passwd", pid, true);
2568
2569         x = strchr(line, ':');
2570         if (!x) {
2571                 log_error("/etc/passwd entry has invalid user field.");
2572                 return -EIO;
2573         }
2574
2575         u = strchr(x+1, ':');
2576         if (!u) {
2577                 log_error("/etc/passwd entry has invalid password field.");
2578                 return -EIO;
2579         }
2580
2581         u++;
2582         g = strchr(u, ':');
2583         if (!g) {
2584                 log_error("/etc/passwd entry has invalid UID field.");
2585                 return -EIO;
2586         }
2587
2588         *g = 0;
2589         g++;
2590         x = strchr(g, ':');
2591         if (!x) {
2592                 log_error("/etc/passwd entry has invalid GID field.");
2593                 return -EIO;
2594         }
2595
2596         *x = 0;
2597         h = strchr(x+1, ':');
2598         if (!h) {
2599                 log_error("/etc/passwd entry has invalid GECOS field.");
2600                 return -EIO;
2601         }
2602
2603         h++;
2604         x = strchr(h, ':');
2605         if (!x) {
2606                 log_error("/etc/passwd entry has invalid home directory field.");
2607                 return -EIO;
2608         }
2609
2610         *x = 0;
2611
2612         r = parse_uid(u, &uid);
2613         if (r < 0) {
2614                 log_error("Failed to parse UID of user.");
2615                 return -EIO;
2616         }
2617
2618         r = parse_gid(g, &gid);
2619         if (r < 0) {
2620                 log_error("Failed to parse GID of user.");
2621                 return -EIO;
2622         }
2623
2624         home = strdup(h);
2625         if (!home)
2626                 return log_oom();
2627
2628         /* Second, get group memberships */
2629         fd = spawn_getent("initgroups", arg_user, &pid);
2630         if (fd < 0)
2631                 return fd;
2632
2633         fclose(f);
2634         f = fdopen(fd, "r");
2635         if (!f)
2636                 return log_oom();
2637         fd = -1;
2638
2639         if (!fgets(line, sizeof(line), f)) {
2640                 if (!ferror(f)) {
2641                         log_error("Failed to resolve user %s.", arg_user);
2642                         return -ESRCH;
2643                 }
2644
2645                 log_error_errno(errno, "Failed to read from getent: %m");
2646                 return -errno;
2647         }
2648
2649         truncate_nl(line);
2650
2651         wait_for_terminate_and_warn("getent initgroups", pid, true);
2652
2653         /* Skip over the username and subsequent separator whitespace */
2654         x = line;
2655         x += strcspn(x, WHITESPACE);
2656         x += strspn(x, WHITESPACE);
2657
2658         FOREACH_WORD(word, l, x, state) {
2659                 char c[l+1];
2660
2661                 memcpy(c, word, l);
2662                 c[l] = 0;
2663
2664                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2665                         return log_oom();
2666
2667                 r = parse_uid(c, &uids[n_uids++]);
2668                 if (r < 0) {
2669                         log_error("Failed to parse group data from getent.");
2670                         return -EIO;
2671                 }
2672         }
2673
2674         r = mkdir_parents(home, 0775);
2675         if (r < 0)
2676                 return log_error_errno(r, "Failed to make home root directory: %m");
2677
2678         r = mkdir_safe(home, 0755, uid, gid);
2679         if (r < 0 && r != -EEXIST)
2680                 return log_error_errno(r, "Failed to make home directory: %m");
2681
2682         fchown(STDIN_FILENO, uid, gid);
2683         fchown(STDOUT_FILENO, uid, gid);
2684         fchown(STDERR_FILENO, uid, gid);
2685
2686         if (setgroups(n_uids, uids) < 0)
2687                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
2688
2689         if (setresgid(gid, gid, gid) < 0)
2690                 return log_error_errno(errno, "setregid() failed: %m");
2691
2692         if (setresuid(uid, uid, uid) < 0)
2693                 return log_error_errno(errno, "setreuid() failed: %m");
2694
2695         if (_home) {
2696                 *_home = home;
2697                 home = NULL;
2698         }
2699
2700         return 0;
2701 }
2702
2703 /*
2704  * Return values:
2705  * < 0 : wait_for_terminate() failed to get the state of the
2706  *       container, the container was terminated by a signal, or
2707  *       failed for an unknown reason.  No change is made to the
2708  *       container argument.
2709  * > 0 : The program executed in the container terminated with an
2710  *       error.  The exit code of the program executed in the
2711  *       container is returned.  The container argument has been set
2712  *       to CONTAINER_TERMINATED.
2713  *   0 : The container is being rebooted, has been shut down or exited
2714  *       successfully.  The container argument has been set to either
2715  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2716  *
2717  * That is, success is indicated by a return value of zero, and an
2718  * error is indicated by a non-zero value.
2719  */
2720 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2721         siginfo_t status;
2722         int r;
2723
2724         r = wait_for_terminate(pid, &status);
2725         if (r < 0)
2726                 return log_warning_errno(r, "Failed to wait for container: %m");
2727
2728         switch (status.si_code) {
2729
2730         case CLD_EXITED:
2731                 if (status.si_status == 0) {
2732                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2733
2734                 } else
2735                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2736
2737                 *container = CONTAINER_TERMINATED;
2738                 return status.si_status;
2739
2740         case CLD_KILLED:
2741                 if (status.si_status == SIGINT) {
2742
2743                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2744                         *container = CONTAINER_TERMINATED;
2745                         return 0;
2746
2747                 } else if (status.si_status == SIGHUP) {
2748
2749                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2750                         *container = CONTAINER_REBOOTED;
2751                         return 0;
2752                 }
2753
2754                 /* CLD_KILLED fallthrough */
2755
2756         case CLD_DUMPED:
2757                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2758                 return -EIO;
2759
2760         default:
2761                 log_error("Container %s failed due to unknown reason.", arg_machine);
2762                 return -EIO;
2763         }
2764
2765         return r;
2766 }
2767
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
2769
2770 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2771         pid_t pid;
2772
2773         pid = PTR_TO_UINT32(userdata);
2774         if (pid > 0) {
2775                 if (kill(pid, SIGRTMIN+3) >= 0) {
2776                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2777                         sd_event_source_set_userdata(s, NULL);
2778                         return 0;
2779                 }
2780         }
2781
2782         sd_event_exit(sd_event_source_get_event(s), 0);
2783         return 0;
2784 }
2785
2786 int main(int argc, char *argv[]) {
2787
2788         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2789         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2790         _cleanup_close_ int master = -1, image_fd = -1;
2791         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2792         _cleanup_fdset_free_ FDSet *fds = NULL;
2793         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2794         const char *console = NULL;
2795         char veth_name[IFNAMSIZ];
2796         bool secondary = false;
2797         sigset_t mask, mask_chld;
2798         pid_t pid = 0;
2799
2800         log_parse_environment();
2801         log_open();
2802
2803         k = parse_argv(argc, argv);
2804         if (k < 0)
2805                 goto finish;
2806         else if (k == 0) {
2807                 r = EXIT_SUCCESS;
2808                 goto finish;
2809         }
2810
2811         if (!arg_image) {
2812                 if (arg_directory) {
2813                         char *p;
2814
2815                         p = path_make_absolute_cwd(arg_directory);
2816                         free(arg_directory);
2817                         arg_directory = p;
2818                 } else
2819                         arg_directory = get_current_dir_name();
2820
2821                 if (!arg_directory) {
2822                         log_error("Failed to determine path, please use -D.");
2823                         goto finish;
2824                 }
2825                 path_kill_slashes(arg_directory);
2826         }
2827
2828         if (!arg_machine) {
2829                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2830                 if (!arg_machine) {
2831                         log_oom();
2832                         goto finish;
2833                 }
2834
2835                 hostname_cleanup(arg_machine, false);
2836                 if (isempty(arg_machine)) {
2837                         log_error("Failed to determine machine name automatically, please use -M.");
2838                         goto finish;
2839                 }
2840         }
2841
2842         if (geteuid() != 0) {
2843                 log_error("Need to be root.");
2844                 goto finish;
2845         }
2846
2847         if (sd_booted() <= 0) {
2848                 log_error("Not running on a systemd system.");
2849                 goto finish;
2850         }
2851
2852         log_close();
2853         n_fd_passed = sd_listen_fds(false);
2854         if (n_fd_passed > 0) {
2855                 k = fdset_new_listen_fds(&fds, false);
2856                 if (k < 0) {
2857                         log_error_errno(k, "Failed to collect file descriptors: %m");
2858                         goto finish;
2859                 }
2860         }
2861         fdset_close_others(fds);
2862         log_open();
2863
2864         if (arg_directory) {
2865                 if (path_equal(arg_directory, "/")) {
2866                         log_error("Spawning container on root directory not supported.");
2867                         goto finish;
2868                 }
2869
2870                 if (arg_boot) {
2871                         if (path_is_os_tree(arg_directory) <= 0) {
2872                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2873                                 goto finish;
2874                         }
2875                 } else {
2876                         const char *p;
2877
2878                         p = strappenda(arg_directory,
2879                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2880                         if (access(p, F_OK) < 0) {
2881                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2882                                 goto finish;
2883
2884                         }
2885                 }
2886         } else {
2887                 char template[] = "/tmp/nspawn-root-XXXXXX";
2888
2889                 if (!mkdtemp(template)) {
2890                         log_error_errno(errno, "Failed to create temporary directory: %m");
2891                         r = -errno;
2892                         goto finish;
2893                 }
2894
2895                 arg_directory = strdup(template);
2896                 if (!arg_directory) {
2897                         r = log_oom();
2898                         goto finish;
2899                 }
2900
2901                 image_fd = setup_image(&device_path, &loop_nr);
2902                 if (image_fd < 0) {
2903                         r = image_fd;
2904                         goto finish;
2905                 }
2906
2907                 r = dissect_image(image_fd,
2908                                   &root_device, &root_device_rw,
2909                                   &home_device, &home_device_rw,
2910                                   &srv_device, &srv_device_rw,
2911                                   &secondary);
2912                 if (r < 0)
2913                         goto finish;
2914         }
2915
2916         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2917         if (master < 0) {
2918                 log_error_errno(errno, "Failed to acquire pseudo tty: %m");
2919                 goto finish;
2920         }
2921
2922         console = ptsname(master);
2923         if (!console) {
2924                 log_error_errno(errno, "Failed to determine tty name: %m");
2925                 goto finish;
2926         }
2927
2928         if (!arg_quiet)
2929                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2930                          arg_machine, arg_image ? arg_image : arg_directory);
2931
2932         if (unlockpt(master) < 0) {
2933                 log_error_errno(errno, "Failed to unlock tty: %m");
2934                 goto finish;
2935         }
2936
2937         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2938                 log_error_errno(errno, "Failed to create kmsg socket pair: %m");
2939                 goto finish;
2940         }
2941
2942         sd_notify(false,
2943                   "READY=1\n"
2944                   "STATUS=Container running.");
2945
2946         assert_se(sigemptyset(&mask) == 0);
2947         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2948         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2949
2950         assert_se(sigemptyset(&mask_chld) == 0);
2951         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
2952
2953         for (;;) {
2954                 ContainerStatus container_status;
2955                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
2956                 struct sigaction sa = {
2957                         .sa_handler = nop_handler,
2958                         .sa_flags = SA_NOCLDSTOP,
2959                 };
2960
2961                 r = barrier_create(&barrier);
2962                 if (r < 0) {
2963                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
2964                         goto finish;
2965                 }
2966
2967                 /* Child can be killed before execv(), so handle SIGCHLD
2968                  * in order to interrupt parent's blocking calls and
2969                  * give it a chance to call wait() and terminate. */
2970                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2971                 if (r < 0) {
2972                         log_error_errno(errno, "Failed to change the signal mask: %m");
2973                         goto finish;
2974                 }
2975
2976                 r = sigaction(SIGCHLD, &sa, NULL);
2977                 if (r < 0) {
2978                         log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
2979                         goto finish;
2980                 }
2981
2982                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
2983                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2984                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
2985                 if (pid < 0) {
2986                         if (errno == EINVAL)
2987                                 log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2988                         else
2989                                 log_error_errno(errno, "clone() failed: %m");
2990
2991                         r = pid;
2992                         goto finish;
2993                 }
2994
2995                 if (pid == 0) {
2996                         /* child */
2997                         _cleanup_free_ char *home = NULL;
2998                         unsigned n_env = 2;
2999                         const char *envp[] = {
3000                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3001                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3002                                 NULL, /* TERM */
3003                                 NULL, /* HOME */
3004                                 NULL, /* USER */
3005                                 NULL, /* LOGNAME */
3006                                 NULL, /* container_uuid */
3007                                 NULL, /* LISTEN_FDS */
3008                                 NULL, /* LISTEN_PID */
3009                                 NULL
3010                         };
3011                         char **env_use;
3012
3013                         barrier_set_role(&barrier, BARRIER_CHILD);
3014
3015                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3016                         if (envp[n_env])
3017                                 n_env ++;
3018
3019                         master = safe_close(master);
3020
3021                         close_nointr(STDIN_FILENO);
3022                         close_nointr(STDOUT_FILENO);
3023                         close_nointr(STDERR_FILENO);
3024
3025                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3026
3027                         reset_all_signal_handlers();
3028                         reset_signal_mask();
3029
3030                         k = open_terminal(console, O_RDWR);
3031                         if (k != STDIN_FILENO) {
3032                                 if (k >= 0) {
3033                                         safe_close(k);
3034                                         k = -EINVAL;
3035                                 }
3036
3037                                 log_error_errno(k, "Failed to open console: %m");
3038                                 _exit(EXIT_FAILURE);
3039                         }
3040
3041                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3042                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3043                                 log_error_errno(errno, "Failed to duplicate console: %m");
3044                                 _exit(EXIT_FAILURE);
3045                         }
3046
3047                         if (setsid() < 0) {
3048                                 log_error_errno(errno, "setsid() failed: %m");
3049                                 _exit(EXIT_FAILURE);
3050                         }
3051
3052                         if (reset_audit_loginuid() < 0)
3053                                 _exit(EXIT_FAILURE);
3054
3055                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3056                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3057                                 _exit(EXIT_FAILURE);
3058                         }
3059
3060                         /* Mark everything as slave, so that we still
3061                          * receive mounts from the real root, but don't
3062                          * propagate mounts to the real root. */
3063                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3064                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3065                                 _exit(EXIT_FAILURE);
3066                         }
3067
3068                         if (mount_devices(arg_directory,
3069                                           root_device, root_device_rw,
3070                                           home_device, home_device_rw,
3071                                           srv_device, srv_device_rw) < 0)
3072                                 _exit(EXIT_FAILURE);
3073
3074                         /* Turn directory into bind mount */
3075                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3076                                 log_error_errno(errno, "Failed to make bind mount: %m");
3077                                 _exit(EXIT_FAILURE);
3078                         }
3079
3080                         r = setup_volatile(arg_directory);
3081                         if (r < 0)
3082                                 _exit(EXIT_FAILURE);
3083
3084                         if (setup_volatile_state(arg_directory) < 0)
3085                                 _exit(EXIT_FAILURE);
3086
3087                         r = base_filesystem_create(arg_directory);
3088                         if (r < 0)
3089                                 _exit(EXIT_FAILURE);
3090
3091                         if (arg_read_only) {
3092                                 k = bind_remount_recursive(arg_directory, true);
3093                                 if (k < 0) {
3094                                         log_error_errno(k, "Failed to make tree read-only: %m");
3095                                         _exit(EXIT_FAILURE);
3096                                 }
3097                         }
3098
3099                         if (mount_all(arg_directory) < 0)
3100                                 _exit(EXIT_FAILURE);
3101
3102                         if (copy_devnodes(arg_directory) < 0)
3103                                 _exit(EXIT_FAILURE);
3104
3105                         if (setup_ptmx(arg_directory) < 0)
3106                                 _exit(EXIT_FAILURE);
3107
3108                         dev_setup(arg_directory);
3109
3110                         if (setup_seccomp() < 0)
3111                                 _exit(EXIT_FAILURE);
3112
3113                         if (setup_dev_console(arg_directory, console) < 0)
3114                                 _exit(EXIT_FAILURE);
3115
3116                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3117                                 _exit(EXIT_FAILURE);
3118
3119                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3120
3121                         if (setup_boot_id(arg_directory) < 0)
3122                                 _exit(EXIT_FAILURE);
3123
3124                         if (setup_timezone(arg_directory) < 0)
3125                                 _exit(EXIT_FAILURE);
3126
3127                         if (setup_resolv_conf(arg_directory) < 0)
3128                                 _exit(EXIT_FAILURE);
3129
3130                         if (setup_journal(arg_directory) < 0)
3131                                 _exit(EXIT_FAILURE);
3132
3133                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3134                                 _exit(EXIT_FAILURE);
3135
3136                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3137                                 _exit(EXIT_FAILURE);
3138
3139                         if (mount_tmpfs(arg_directory) < 0)
3140                                 _exit(EXIT_FAILURE);
3141
3142                         /* Tell the parent that we are ready, and that
3143                          * it can cgroupify us to that we lack access
3144                          * to certain devices and resources. */
3145                         (void)barrier_place(&barrier);
3146
3147                         if (chdir(arg_directory) < 0) {
3148                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3149                                 _exit(EXIT_FAILURE);
3150                         }
3151
3152                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3153                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3154                                 _exit(EXIT_FAILURE);
3155                         }
3156
3157                         if (chroot(".") < 0) {
3158                                 log_error_errno(errno, "chroot() failed: %m");
3159                                 _exit(EXIT_FAILURE);
3160                         }
3161
3162                         if (chdir("/") < 0) {
3163                                 log_error_errno(errno, "chdir() failed: %m");
3164                                 _exit(EXIT_FAILURE);
3165                         }
3166
3167                         umask(0022);
3168
3169                         if (arg_private_network)
3170                                 loopback_setup();
3171
3172                         if (drop_capabilities() < 0) {
3173                                 log_error_errno(errno, "drop_capabilities() failed: %m");
3174                                 _exit(EXIT_FAILURE);
3175                         }
3176
3177                         r = change_uid_gid(&home);
3178                         if (r < 0)
3179                                 _exit(EXIT_FAILURE);
3180
3181                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3182                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3183                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3184                                 log_oom();
3185                                 _exit(EXIT_FAILURE);
3186                         }
3187
3188                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3189                                 char as_uuid[37];
3190
3191                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3192                                         log_oom();
3193                                         _exit(EXIT_FAILURE);
3194                                 }
3195                         }
3196
3197                         if (fdset_size(fds) > 0) {
3198                                 k = fdset_cloexec(fds, false);
3199                                 if (k < 0) {
3200                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3201                                         _exit(EXIT_FAILURE);
3202                                 }
3203
3204                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3205                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3206                                         log_oom();
3207                                         _exit(EXIT_FAILURE);
3208                                 }
3209                         }
3210
3211                         setup_hostname();
3212
3213                         if (arg_personality != 0xffffffffLU) {
3214                                 if (personality(arg_personality) < 0) {
3215                                         log_error_errno(errno, "personality() failed: %m");
3216                                         _exit(EXIT_FAILURE);
3217                                 }
3218                         } else if (secondary) {
3219                                 if (personality(PER_LINUX32) < 0) {
3220                                         log_error_errno(errno, "personality() failed: %m");
3221                                         _exit(EXIT_FAILURE);
3222                                 }
3223                         }
3224
3225 #ifdef HAVE_SELINUX
3226                         if (arg_selinux_context)
3227                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3228                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3229                                         _exit(EXIT_FAILURE);
3230                                 }
3231 #endif
3232
3233                         if (!strv_isempty(arg_setenv)) {
3234                                 char **n;
3235
3236                                 n = strv_env_merge(2, envp, arg_setenv);
3237                                 if (!n) {
3238                                         log_oom();
3239                                         _exit(EXIT_FAILURE);
3240                                 }
3241
3242                                 env_use = n;
3243                         } else
3244                                 env_use = (char**) envp;
3245
3246                         /* Wait until the parent is ready with the setup, too... */
3247                         if (!barrier_place_and_sync(&barrier))
3248                                 _exit(EXIT_FAILURE);
3249
3250                         if (arg_boot) {
3251                                 char **a;
3252                                 size_t l;
3253
3254                                 /* Automatically search for the init system */
3255
3256                                 l = 1 + argc - optind;
3257                                 a = newa(char*, l + 1);
3258                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3259
3260                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3261                                 execve(a[0], a, env_use);
3262
3263                                 a[0] = (char*) "/lib/systemd/systemd";
3264                                 execve(a[0], a, env_use);
3265
3266                                 a[0] = (char*) "/sbin/init";
3267                                 execve(a[0], a, env_use);
3268                         } else if (argc > optind)
3269                                 execvpe(argv[optind], argv + optind, env_use);
3270                         else {
3271                                 chdir(home ? home : "/root");
3272                                 execle("/bin/bash", "-bash", NULL, env_use);
3273                                 execle("/bin/sh", "-sh", NULL, env_use);
3274                         }
3275
3276                         log_error_errno(errno, "execv() failed: %m");
3277                         _exit(EXIT_FAILURE);
3278                 }
3279
3280                 barrier_set_role(&barrier, BARRIER_PARENT);
3281                 fdset_free(fds);
3282                 fds = NULL;
3283
3284                 /* wait for child-setup to be done */
3285                 if (barrier_place_and_sync(&barrier)) {
3286                         _cleanup_event_unref_ sd_event *event = NULL;
3287                         _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3288                         int ifi = 0;
3289
3290                         r = move_network_interfaces(pid);
3291                         if (r < 0)
3292                                 goto finish;
3293
3294                         r = setup_veth(pid, veth_name, &ifi);
3295                         if (r < 0)
3296                                 goto finish;
3297
3298                         r = setup_bridge(veth_name, &ifi);
3299                         if (r < 0)
3300                                 goto finish;
3301
3302                         r = setup_macvlan(pid);
3303                         if (r < 0)
3304                                 goto finish;
3305
3306                         r = register_machine(pid, ifi);
3307                         if (r < 0)
3308                                 goto finish;
3309
3310                         /* Block SIGCHLD here, before notifying child.
3311                          * process_pty() will handle it with the other signals. */
3312                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3313                         if (r < 0)
3314                                 goto finish;
3315
3316                         /* Reset signal to default */
3317                         r = default_signals(SIGCHLD, -1);
3318                         if (r < 0)
3319                                 goto finish;
3320
3321                         /* Notify the child that the parent is ready with all
3322                          * its setup, and that the child can now hand over
3323                          * control to the code to run inside the container. */
3324                         (void)barrier_place(&barrier);
3325
3326                         r = sd_event_new(&event);
3327                         if (r < 0) {
3328                                 log_error_errno(r, "Failed to get default event source: %m");
3329                                 goto finish;
3330                         }
3331
3332                         if (arg_boot) {
3333                                 /* Try to kill the init system on SIGINT or SIGTERM */
3334                                 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3335                                 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3336                         } else {
3337                                 /* Immediately exit */
3338                                 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3339                                 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3340                         }
3341
3342                         /* simply exit on sigchld */
3343                         sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3344
3345                         r = pty_forward_new(event, master, &forward);
3346                         if (r < 0) {
3347                                 log_error_errno(r, "Failed to create PTY forwarder: %m");
3348                                 goto finish;
3349                         }
3350
3351                         r = sd_event_loop(event);
3352                         if (r < 0)
3353                                 return log_error_errno(r, "Failed to run event loop: %m");
3354
3355                         forward = pty_forward_free(forward);
3356
3357                         if (!arg_quiet)
3358                                 putc('\n', stdout);
3359
3360                         /* Kill if it is not dead yet anyway */
3361                         terminate_machine(pid);
3362                 }
3363
3364                 /* Normally redundant, but better safe than sorry */
3365                 kill(pid, SIGKILL);
3366
3367                 r = wait_for_container(pid, &container_status);
3368                 pid = 0;
3369
3370                 if (r < 0) {
3371                         /* We failed to wait for the container, or the
3372                          * container exited abnormally */
3373                         r = EXIT_FAILURE;
3374                         break;
3375                 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3376                         /* The container exited with a non-zero
3377                          * status, or with zero status and no reboot
3378                          * was requested. */
3379                         break;
3380
3381                 /* CONTAINER_REBOOTED, loop again */
3382
3383                 if (arg_keep_unit) {
3384                         /* Special handling if we are running as a
3385                          * service: instead of simply restarting the
3386                          * machine we want to restart the entire
3387                          * service, so let's inform systemd about this
3388                          * with the special exit code 133. The service
3389                          * file uses RestartForceExitStatus=133 so
3390                          * that this results in a full nspawn
3391                          * restart. This is necessary since we might
3392                          * have cgroup parameters set we want to have
3393                          * flushed out. */
3394                         r = 133;
3395                         break;
3396                 }
3397         }
3398
3399 finish:
3400         sd_notify(false,
3401                   "STOPPING=1\n"
3402                   "STATUS=Terminating...");
3403
3404         loop_remove(loop_nr, &image_fd);
3405
3406         if (pid > 0)
3407                 kill(pid, SIGKILL);
3408
3409         free(arg_directory);
3410         free(arg_machine);
3411         free(arg_user);
3412         strv_free(arg_setenv);
3413         strv_free(arg_network_interfaces);
3414         strv_free(arg_network_macvlan);
3415         strv_free(arg_bind);
3416         strv_free(arg_bind_ro);
3417         strv_free(arg_tmpfs);
3418
3419         return r;
3420 }