chiark / gitweb /
2bda27edf07472a5e36650b488c2b43afbc5054b
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46
47 #ifdef HAVE_SELINUX
48 #include <selinux/selinux.h>
49 #endif
50
51 #ifdef HAVE_SECCOMP
52 #include <seccomp.h>
53 #endif
54
55 #ifdef HAVE_BLKID
56 #include <blkid/blkid.h>
57 #endif
58
59 #include "sd-daemon.h"
60 #include "sd-bus.h"
61 #include "sd-id128.h"
62 #include "sd-rtnl.h"
63 #include "log.h"
64 #include "util.h"
65 #include "mkdir.h"
66 #include "macro.h"
67 #include "audit.h"
68 #include "missing.h"
69 #include "cgroup-util.h"
70 #include "strv.h"
71 #include "path-util.h"
72 #include "loopback-setup.h"
73 #include "dev-setup.h"
74 #include "fdset.h"
75 #include "build.h"
76 #include "fileio.h"
77 #include "bus-util.h"
78 #include "bus-error.h"
79 #include "ptyfwd.h"
80 #include "bus-kernel.h"
81 #include "env-util.h"
82 #include "def.h"
83 #include "rtnl-util.h"
84 #include "udev-util.h"
85 #include "blkid-util.h"
86 #include "gpt.h"
87 #include "siphash24.h"
88 #include "copy.h"
89 #include "base-filesystem.h"
90 #include "barrier.h"
91 #include "event-util.h"
92 #include "capability.h"
93 #include "cap-list.h"
94 #include "btrfs-util.h"
95 #include "machine-image.h"
96 #include "list.h"
97 #include "in-addr-util.h"
98 #include "fw-util.h"
99 #include "local-addresses.h"
100
101 #ifdef HAVE_SECCOMP
102 #include "seccomp-util.h"
103 #endif
104
/* One --port= request, kept as an element of the arg_expose_ports list. */
typedef struct ExposePort {
        int protocol;                           /* IPPROTO_TCP or IPPROTO_UDP, as chosen by the "tcp:"/"udp:" prefix */
        uint16_t host_port;                     /* port number on the host side */
        uint16_t container_port;                /* port number inside the container */
        LIST_FIELDS(struct ExposePort, ports);  /* list linkage, used via LIST_PREPEND()/LIST_FOREACH() with name "ports" */
} ExposePort;
111
/* Outcome of a container run.
 * NOTE(review): the users of this enum are not in this part of the file; the
 * meaning of the values is taken from their names only -- confirm against the callers. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
116
/* Mode selected with --link-journal= (or -j); stored in arg_link_journal.
 * The "try-" variants of the command line values map to the same enum value
 * and additionally set arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, also the initial value */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST      /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
123
/* Mode selected with --volatile[=MODE]; stored in arg_volatile. */
typedef enum Volatile {
        VOLATILE_NO,    /* initial value, or an explicit boolean "false" argument */
        VOLATILE_YES,   /* --volatile without argument, or a boolean "true" argument */
        VOLATILE_STATE, /* --volatile=state */
} Volatile;
129
/* Command line settings, filled in by parse_argv().
 *
 * The non-const "char *" strings are heap copies that are free()d when
 * replaced by parse_argv(). The "const char *" ones are assigned directly
 * from optarg and hence are not owned. */
static char *arg_directory = NULL;                      /* -D / --directory= */
static char *arg_template = NULL;                       /* --template= */
static char *arg_user = NULL;                           /* -u / --user= */
static sd_id128_t arg_uuid = {};                        /* --uuid= */
static char *arg_machine = NULL;                        /* -M / --machine= */
static const char *arg_selinux_context = NULL;          /* -Z / --selinux-context= */
static const char *arg_selinux_apifs_context = NULL;    /* -L / --selinux-apifs-context= */
static const char *arg_slice = NULL;                    /* -S / --slice= */
static bool arg_private_network = false;                /* --private-network, also implied by the other --network-* options */
static bool arg_read_only = false;                      /* --read-only */
static bool arg_boot = false;                           /* -b / --boot */
static bool arg_ephemeral = false;                      /* -x / --ephemeral */
static LinkJournal arg_link_journal = LINK_AUTO;        /* --link-journal= / -j */
static bool arg_link_journal_try = false;               /* set for the "try-" variants of --link-journal= and for -j */
/* Default capability bit mask. At the end of parse_argv() the bits from
 * --capability= are ORed in, CAP_NET_ADMIN is added if private networking
 * is enabled, and the bits from --drop-capability= are masked out. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
/* --bind= and --bind-ro= each append two entries per option: the part before
 * the ':' and the part after it (or the same path twice if there is no ':'). */
static char **arg_bind = NULL;
static char **arg_bind_ro = NULL;
/* --tmpfs= appends two entries per option: the path and the option string
 * ("mode=0755" if none was given). */
static char **arg_tmpfs = NULL;
static char **arg_setenv = NULL;                        /* --setenv= assignments */
static bool arg_quiet = false;                          /* -q / --quiet */
static bool arg_share_system = false;                   /* --share-system; forces arg_register to false */
static bool arg_register = true;                        /* --register= */
static bool arg_keep_unit = false;                      /* --keep-unit */
static char **arg_network_interfaces = NULL;            /* --network-interface= */
static char **arg_network_macvlan = NULL;               /* --network-macvlan= */
static bool arg_network_veth = false;                   /* -n / --network-veth, also set by --network-bridge= */
static const char *arg_network_bridge = NULL;           /* --network-bridge= */
/* --personality=. 0xffffffffLU is the value parse_argv() treats as a failed
 * personality_from_string() lookup; as initial value it presumably means
 * "no personality requested" -- TODO confirm against the code that applies it. */
static unsigned long arg_personality = 0xffffffffLU;
static char *arg_image = NULL;                          /* -i / --image= */
static Volatile arg_volatile = VOLATILE_NO;             /* --volatile[=MODE] */
static ExposePort *arg_expose_ports = NULL;             /* head of the list built from -p / --port= */
187
188 static void help(void) {
189         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
190                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
191                "  -h --help                 Show this help\n"
192                "     --version              Print version string\n"
193                "  -q --quiet                Do not show status information\n"
194                "  -D --directory=PATH       Root directory for the container\n"
195                "     --template=PATH        Initialize root directory from template directory,\n"
196                "                            if missing\n"
197                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
198                "                            remove it after exit\n"
199                "  -i --image=PATH           File system device or disk image for the container\n"
200                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
201                "  -u --user=USER            Run the command under specified user or uid\n"
202                "  -M --machine=NAME         Set the machine name for the container\n"
203                "     --uuid=UUID            Set a specific machine UUID for the container\n"
204                "  -S --slice=SLICE          Place the container in the specified slice\n"
205                "     --private-network      Disable network in container\n"
206                "     --network-interface=INTERFACE\n"
207                "                            Assign an existing network interface to the\n"
208                "                            container\n"
209                "     --network-macvlan=INTERFACE\n"
210                "                            Create a macvlan network interface based on an\n"
211                "                            existing network interface to the container\n"
212                "  -n --network-veth         Add a virtual ethernet connection between host\n"
213                "                            and container\n"
214                "     --network-bridge=INTERFACE\n"
215                "                            Add a virtual ethernet connection between host\n"
216                "                            and container and add it to an existing bridge on\n"
217                "                            the host\n"
218                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
219                "                            Expose a container IP port on the host\n"
220                "  -Z --selinux-context=SECLABEL\n"
221                "                            Set the SELinux security context to be used by\n"
222                "                            processes in the container\n"
223                "  -L --selinux-apifs-context=SECLABEL\n"
224                "                            Set the SELinux security context to be used by\n"
225                "                            API/tmpfs file systems in the container\n"
226                "     --capability=CAP       In addition to the default, retain specified\n"
227                "                            capability\n"
228                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
229                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
230                "                            try-guest, try-host\n"
231                "  -j                        Equivalent to --link-journal=try-guest\n"
232                "     --read-only            Mount the root directory read-only\n"
233                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
234                "                            the container\n"
235                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
236                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
237                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
238                "     --share-system         Share system namespaces with host\n"
239                "     --register=BOOLEAN     Register container as machine\n"
240                "     --keep-unit            Do not register a scope for the machine, reuse\n"
241                "                            the service unit nspawn is running in\n"
242                "     --volatile[=MODE]      Run the system in volatile mode\n"
243                , program_invocation_short_name);
244 }
245
/* Replace *b with a cleaned-up version of 'path'.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * with ENOENT the path is instead made absolute with
 * path_make_absolute_cwd(); any other failure is returned as -errno. The
 * resulting string is run through path_kill_slashes() and stored in *b
 * after the previous value of *b has been free()d.
 *
 * Returns 0 on success, -ENOMEM if path_make_absolute_cwd() returns NULL,
 * or a negative errno from canonicalize_file_name(). On failure *b is left
 * untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Nothing exists at this path, so it cannot be canonicalized;
                 * settle for an absolute version of what we were given. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
266
267 static int parse_argv(int argc, char *argv[]) {
268
269         enum {
270                 ARG_VERSION = 0x100,
271                 ARG_PRIVATE_NETWORK,
272                 ARG_UUID,
273                 ARG_READ_ONLY,
274                 ARG_CAPABILITY,
275                 ARG_DROP_CAPABILITY,
276                 ARG_LINK_JOURNAL,
277                 ARG_BIND,
278                 ARG_BIND_RO,
279                 ARG_TMPFS,
280                 ARG_SETENV,
281                 ARG_SHARE_SYSTEM,
282                 ARG_REGISTER,
283                 ARG_KEEP_UNIT,
284                 ARG_NETWORK_INTERFACE,
285                 ARG_NETWORK_MACVLAN,
286                 ARG_NETWORK_BRIDGE,
287                 ARG_PERSONALITY,
288                 ARG_VOLATILE,
289                 ARG_TEMPLATE,
290         };
291
292         static const struct option options[] = {
293                 { "help",                  no_argument,       NULL, 'h'                   },
294                 { "version",               no_argument,       NULL, ARG_VERSION           },
295                 { "directory",             required_argument, NULL, 'D'                   },
296                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
297                 { "ephemeral",             no_argument,       NULL, 'x'                   },
298                 { "user",                  required_argument, NULL, 'u'                   },
299                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
300                 { "boot",                  no_argument,       NULL, 'b'                   },
301                 { "uuid",                  required_argument, NULL, ARG_UUID              },
302                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
303                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
304                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
305                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
306                 { "bind",                  required_argument, NULL, ARG_BIND              },
307                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
308                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
309                 { "machine",               required_argument, NULL, 'M'                   },
310                 { "slice",                 required_argument, NULL, 'S'                   },
311                 { "setenv",                required_argument, NULL, ARG_SETENV            },
312                 { "selinux-context",       required_argument, NULL, 'Z'                   },
313                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
314                 { "quiet",                 no_argument,       NULL, 'q'                   },
315                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
316                 { "register",              required_argument, NULL, ARG_REGISTER          },
317                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
318                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
319                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
320                 { "network-veth",          no_argument,       NULL, 'n'                   },
321                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
322                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
323                 { "image",                 required_argument, NULL, 'i'                   },
324                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
325                 { "port",                  required_argument, NULL, 'p'                   },
326                 {}
327         };
328
329         int c, r;
330         uint64_t plus = 0, minus = 0;
331
332         assert(argc >= 0);
333         assert(argv);
334
335         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
336
337                 switch (c) {
338
339                 case 'h':
340                         help();
341                         return 0;
342
343                 case ARG_VERSION:
344                         puts(PACKAGE_STRING);
345                         puts(SYSTEMD_FEATURES);
346                         return 0;
347
348                 case 'D':
349                         r = set_sanitized_path(&arg_directory, optarg);
350                         if (r < 0)
351                                 return log_error_errno(r, "Invalid root directory: %m");
352
353                         break;
354
355                 case ARG_TEMPLATE:
356                         r = set_sanitized_path(&arg_template, optarg);
357                         if (r < 0)
358                                 return log_error_errno(r, "Invalid template directory: %m");
359
360                         break;
361
362                 case 'i':
363                         r = set_sanitized_path(&arg_image, optarg);
364                         if (r < 0)
365                                 return log_error_errno(r, "Invalid image path: %m");
366
367                         break;
368
369                 case 'x':
370                         arg_ephemeral = true;
371                         break;
372
373                 case 'u':
374                         free(arg_user);
375                         arg_user = strdup(optarg);
376                         if (!arg_user)
377                                 return log_oom();
378
379                         break;
380
381                 case ARG_NETWORK_BRIDGE:
382                         arg_network_bridge = optarg;
383
384                         /* fall through */
385
386                 case 'n':
387                         arg_network_veth = true;
388                         arg_private_network = true;
389                         break;
390
391                 case ARG_NETWORK_INTERFACE:
392                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
393                                 return log_oom();
394
395                         arg_private_network = true;
396                         break;
397
398                 case ARG_NETWORK_MACVLAN:
399                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
400                                 return log_oom();
401
402                         /* fall through */
403
404                 case ARG_PRIVATE_NETWORK:
405                         arg_private_network = true;
406                         break;
407
408                 case 'b':
409                         arg_boot = true;
410                         break;
411
412                 case ARG_UUID:
413                         r = sd_id128_from_string(optarg, &arg_uuid);
414                         if (r < 0) {
415                                 log_error("Invalid UUID: %s", optarg);
416                                 return r;
417                         }
418                         break;
419
420                 case 'S':
421                         arg_slice = optarg;
422                         break;
423
424                 case 'M':
425                         if (isempty(optarg)) {
426                                 free(arg_machine);
427                                 arg_machine = NULL;
428                         } else {
429                                 if (!machine_name_is_valid(optarg)) {
430                                         log_error("Invalid machine name: %s", optarg);
431                                         return -EINVAL;
432                                 }
433
434                                 r = free_and_strdup(&arg_machine, optarg);
435                                 if (r < 0)
436                                         return log_oom();
437
438                                 break;
439                         }
440
441                 case 'Z':
442                         arg_selinux_context = optarg;
443                         break;
444
445                 case 'L':
446                         arg_selinux_apifs_context = optarg;
447                         break;
448
449                 case ARG_READ_ONLY:
450                         arg_read_only = true;
451                         break;
452
453                 case ARG_CAPABILITY:
454                 case ARG_DROP_CAPABILITY: {
455                         const char *state, *word;
456                         size_t length;
457
458                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
459                                 _cleanup_free_ char *t;
460
461                                 t = strndup(word, length);
462                                 if (!t)
463                                         return log_oom();
464
465                                 if (streq(t, "all")) {
466                                         if (c == ARG_CAPABILITY)
467                                                 plus = (uint64_t) -1;
468                                         else
469                                                 minus = (uint64_t) -1;
470                                 } else {
471                                         int cap;
472
473                                         cap = capability_from_name(t);
474                                         if (cap < 0) {
475                                                 log_error("Failed to parse capability %s.", t);
476                                                 return -EINVAL;
477                                         }
478
479                                         if (c == ARG_CAPABILITY)
480                                                 plus |= 1ULL << (uint64_t) cap;
481                                         else
482                                                 minus |= 1ULL << (uint64_t) cap;
483                                 }
484                         }
485
486                         break;
487                 }
488
489                 case 'j':
490                         arg_link_journal = LINK_GUEST;
491                         arg_link_journal_try = true;
492                         break;
493
494                 case ARG_LINK_JOURNAL:
495                         if (streq(optarg, "auto")) {
496                                 arg_link_journal = LINK_AUTO;
497                                 arg_link_journal_try = false;
498                         } else if (streq(optarg, "no")) {
499                                 arg_link_journal = LINK_NO;
500                                 arg_link_journal_try = false;
501                         } else if (streq(optarg, "guest")) {
502                                 arg_link_journal = LINK_GUEST;
503                                 arg_link_journal_try = false;
504                         } else if (streq(optarg, "host")) {
505                                 arg_link_journal = LINK_HOST;
506                                 arg_link_journal_try = false;
507                         } else if (streq(optarg, "try-guest")) {
508                                 arg_link_journal = LINK_GUEST;
509                                 arg_link_journal_try = true;
510                         } else if (streq(optarg, "try-host")) {
511                                 arg_link_journal = LINK_HOST;
512                                 arg_link_journal_try = true;
513                         } else {
514                                 log_error("Failed to parse link journal mode %s", optarg);
515                                 return -EINVAL;
516                         }
517
518                         break;
519
520                 case ARG_BIND:
521                 case ARG_BIND_RO: {
522                         _cleanup_free_ char *a = NULL, *b = NULL;
523                         char *e;
524                         char ***x;
525
526                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
527
528                         e = strchr(optarg, ':');
529                         if (e) {
530                                 a = strndup(optarg, e - optarg);
531                                 b = strdup(e + 1);
532                         } else {
533                                 a = strdup(optarg);
534                                 b = strdup(optarg);
535                         }
536
537                         if (!a || !b)
538                                 return log_oom();
539
540                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
541                                 log_error("Invalid bind mount specification: %s", optarg);
542                                 return -EINVAL;
543                         }
544
545                         r = strv_extend(x, a);
546                         if (r < 0)
547                                 return log_oom();
548
549                         r = strv_extend(x, b);
550                         if (r < 0)
551                                 return log_oom();
552
553                         break;
554                 }
555
556                 case ARG_TMPFS: {
557                         _cleanup_free_ char *a = NULL, *b = NULL;
558                         char *e;
559
560                         e = strchr(optarg, ':');
561                         if (e) {
562                                 a = strndup(optarg, e - optarg);
563                                 b = strdup(e + 1);
564                         } else {
565                                 a = strdup(optarg);
566                                 b = strdup("mode=0755");
567                         }
568
569                         if (!a || !b)
570                                 return log_oom();
571
572                         if (!path_is_absolute(a)) {
573                                 log_error("Invalid tmpfs specification: %s", optarg);
574                                 return -EINVAL;
575                         }
576
577                         r = strv_push(&arg_tmpfs, a);
578                         if (r < 0)
579                                 return log_oom();
580
581                         a = NULL;
582
583                         r = strv_push(&arg_tmpfs, b);
584                         if (r < 0)
585                                 return log_oom();
586
587                         b = NULL;
588
589                         break;
590                 }
591
592                 case ARG_SETENV: {
593                         char **n;
594
595                         if (!env_assignment_is_valid(optarg)) {
596                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
597                                 return -EINVAL;
598                         }
599
600                         n = strv_env_set(arg_setenv, optarg);
601                         if (!n)
602                                 return log_oom();
603
604                         strv_free(arg_setenv);
605                         arg_setenv = n;
606                         break;
607                 }
608
609                 case 'q':
610                         arg_quiet = true;
611                         break;
612
613                 case ARG_SHARE_SYSTEM:
614                         arg_share_system = true;
615                         break;
616
617                 case ARG_REGISTER:
618                         r = parse_boolean(optarg);
619                         if (r < 0) {
620                                 log_error("Failed to parse --register= argument: %s", optarg);
621                                 return r;
622                         }
623
624                         arg_register = r;
625                         break;
626
627                 case ARG_KEEP_UNIT:
628                         arg_keep_unit = true;
629                         break;
630
631                 case ARG_PERSONALITY:
632
633                         arg_personality = personality_from_string(optarg);
634                         if (arg_personality == 0xffffffffLU) {
635                                 log_error("Unknown or unsupported personality '%s'.", optarg);
636                                 return -EINVAL;
637                         }
638
639                         break;
640
641                 case ARG_VOLATILE:
642
643                         if (!optarg)
644                                 arg_volatile = VOLATILE_YES;
645                         else {
646                                 r = parse_boolean(optarg);
647                                 if (r < 0) {
648                                         if (streq(optarg, "state"))
649                                                 arg_volatile = VOLATILE_STATE;
650                                         else {
651                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
652                                                 return r;
653                                         }
654                                 } else
655                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
656                         }
657
658                         break;
659
660                 case 'p': {
661                         const char *split, *e;
662                         uint16_t container_port, host_port;
663                         int protocol;
664                         ExposePort *p;
665
666                         if ((e = startswith(optarg, "tcp:")))
667                                 protocol = IPPROTO_TCP;
668                         else if ((e = startswith(optarg, "udp:")))
669                                 protocol = IPPROTO_UDP;
670                         else {
671                                 e = optarg;
672                                 protocol = IPPROTO_TCP;
673                         }
674
675                         split = strchr(e, ':');
676                         if (split) {
677                                 char v[split - e + 1];
678
679                                 memcpy(v, e, split - e);
680                                 v[split - e] = 0;
681
682                                 r = safe_atou16(v, &host_port);
683                                 if (r < 0 || host_port <= 0) {
684                                         log_error("Failed to parse host port: %s", optarg);
685                                         return -EINVAL;
686                                 }
687
688                                 r = safe_atou16(split + 1, &container_port);
689                         } else {
690                                 r = safe_atou16(e, &container_port);
691                                 host_port = container_port;
692                         }
693
694                         if (r < 0 || container_port <= 0) {
695                                 log_error("Failed to parse host port: %s", optarg);
696                                 return -EINVAL;
697                         }
698
699                         LIST_FOREACH(ports, p, arg_expose_ports) {
700                                 if (p->protocol == protocol && p->host_port == host_port) {
701                                         log_error("Duplicate port specification: %s", optarg);
702                                         return -EINVAL;
703                                 }
704                         }
705
706                         p = new(ExposePort, 1);
707                         if (!p)
708                                 return log_oom();
709
710                         p->protocol = protocol;
711                         p->host_port = host_port;
712                         p->container_port = container_port;
713
714                         LIST_PREPEND(ports, arg_expose_ports, p);
715
716                         break;
717                 }
718
719                 case '?':
720                         return -EINVAL;
721
722                 default:
723                         assert_not_reached("Unhandled option");
724                 }
725
726         if (arg_share_system)
727                 arg_register = false;
728
729         if (arg_boot && arg_share_system) {
730                 log_error("--boot and --share-system may not be combined.");
731                 return -EINVAL;
732         }
733
734         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
735                 log_error("--keep-unit may not be used when invoked from a user session.");
736                 return -EINVAL;
737         }
738
739         if (arg_directory && arg_image) {
740                 log_error("--directory= and --image= may not be combined.");
741                 return -EINVAL;
742         }
743
744         if (arg_template && arg_image) {
745                 log_error("--template= and --image= may not be combined.");
746                 return -EINVAL;
747         }
748
749         if (arg_template && !(arg_directory || arg_machine)) {
750                 log_error("--template= needs --directory= or --machine=.");
751                 return -EINVAL;
752         }
753
754         if (arg_ephemeral && arg_template) {
755                 log_error("--ephemeral and --template= may not be combined.");
756                 return -EINVAL;
757         }
758
759         if (arg_ephemeral && arg_image) {
760                 log_error("--ephemeral and --image= may not be combined.");
761                 return -EINVAL;
762         }
763
764         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
765                 log_error("--ephemeral and --link-journal= may not be combined.");
766                 return -EINVAL;
767         }
768
769         if (arg_volatile != VOLATILE_NO && arg_read_only) {
770                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
771                 return -EINVAL;
772         }
773
774         if (arg_expose_ports && !arg_private_network) {
775                 log_error("Cannot use --port= without private networking.");
776                 return -EINVAL;
777         }
778
779         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
780
781         return 1;
782 }
783
784 static int mount_all(const char *dest) {
785
786         typedef struct MountPoint {
787                 const char *what;
788                 const char *where;
789                 const char *type;
790                 const char *options;
791                 unsigned long flags;
792                 bool fatal;
793         } MountPoint;
794
795         static const MountPoint mount_table[] = {
796                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
797                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
798                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
799                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
800                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
801                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
802                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
803                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
804 #ifdef HAVE_SELINUX
805                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
806                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
807 #endif
808         };
809
810         unsigned k;
811         int r = 0;
812
813         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
814                 _cleanup_free_ char *where = NULL;
815 #ifdef HAVE_SELINUX
816                 _cleanup_free_ char *options = NULL;
817 #endif
818                 const char *o;
819                 int t;
820
821                 where = strjoin(dest, "/", mount_table[k].where, NULL);
822                 if (!where)
823                         return log_oom();
824
825                 t = path_is_mount_point(where, true);
826                 if (t < 0) {
827                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
828
829                         if (r == 0)
830                                 r = t;
831
832                         continue;
833                 }
834
835                 /* Skip this entry if it is not a remount. */
836                 if (mount_table[k].what && t > 0)
837                         continue;
838
839                 t = mkdir_p(where, 0755);
840                 if (t < 0) {
841                         if (mount_table[k].fatal) {
842                                log_error_errno(t, "Failed to create directory %s: %m", where);
843
844                                 if (r == 0)
845                                         r = t;
846                         } else
847                                log_warning_errno(t, "Failed to create directory %s: %m", where);
848
849                         continue;
850                 }
851
852 #ifdef HAVE_SELINUX
853                 if (arg_selinux_apifs_context &&
854                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
855                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
856                         if (!options)
857                                 return log_oom();
858
859                         o = options;
860                 } else
861 #endif
862                         o = mount_table[k].options;
863
864
865                 if (mount(mount_table[k].what,
866                           where,
867                           mount_table[k].type,
868                           mount_table[k].flags,
869                           o) < 0) {
870
871                         if (mount_table[k].fatal) {
872                                 log_error_errno(errno, "mount(%s) failed: %m", where);
873
874                                 if (r == 0)
875                                         r = -errno;
876                         } else
877                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
878                 }
879         }
880
881         return r;
882 }
883
884 static int mount_binds(const char *dest, char **l, bool ro) {
885         char **x, **y;
886
887         STRV_FOREACH_PAIR(x, y, l) {
888                 _cleanup_free_ char *where = NULL;
889                 struct stat source_st, dest_st;
890                 int r;
891
892                 if (stat(*x, &source_st) < 0)
893                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
894
895                 where = strappend(dest, *y);
896                 if (!where)
897                         return log_oom();
898
899                 r = stat(where, &dest_st);
900                 if (r == 0) {
901                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
902                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
903                                 return -EINVAL;
904                         }
905                 } else if (errno == ENOENT) {
906                         r = mkdir_parents_label(where, 0755);
907                         if (r < 0)
908                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
909                 } else {
910                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
911                         return -errno;
912                 }
913
914                 /* Create the mount point, but be conservative -- refuse to create block
915                  * and char devices. */
916                 if (S_ISDIR(source_st.st_mode)) {
917                         r = mkdir_label(where, 0755);
918                         if (r < 0 && errno != EEXIST)
919                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
920                 } else if (S_ISFIFO(source_st.st_mode)) {
921                         r = mkfifo(where, 0644);
922                         if (r < 0 && errno != EEXIST)
923                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
924                 } else if (S_ISSOCK(source_st.st_mode)) {
925                         r = mknod(where, 0644 | S_IFSOCK, 0);
926                         if (r < 0 && errno != EEXIST)
927                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
928                 } else if (S_ISREG(source_st.st_mode)) {
929                         r = touch(where);
930                         if (r < 0)
931                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
932                 } else {
933                         log_error("Refusing to create mountpoint for file: %s", *x);
934                         return -ENOTSUP;
935                 }
936
937                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
938                         return log_error_errno(errno, "mount(%s) failed: %m", where);
939
940                 if (ro) {
941                         r = bind_remount_recursive(where, true);
942                         if (r < 0)
943                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
944                 }
945         }
946
947         return 0;
948 }
949
/* Mounts the cgroup hierarchy "controller" at <dest>/sys/fs/cgroup/<hierarchy>,
 * read-only if requested. Returns 0 if the path already is a mount point,
 * 1 if a new mount was established, and a negative error code on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        unsigned long mount_flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *mount_point;
        int q;

        mount_point = strappenda(dest, "/sys/fs/cgroup/", hierarchy);

        q = path_is_mount_point(mount_point, false);
        if (q < 0)
                return log_error_errno(q, "Failed to determine if %s is mounted already: %m", mount_point);
        if (q > 0)
                return 0;

        /* Best effort; if this fails, the mount() below fails too. */
        mkdir_p(mount_point, 0755);

        if (read_only)
                mount_flags |= MS_RDONLY;

        if (mount("cgroup", mount_point, "cgroup", mount_flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", mount_point);

        return 1;
}
969
/* Builds the container's /sys/fs/cgroup tree below "dest": a tmpfs at the
 * top, one read-only mount per controller hierarchy (plus symlinks for
 * controllers that share a combined hierarchy), and the name=systemd
 * hierarchy, of which only our own cgroup subtree is left writable. Finally
 * the tmpfs itself is remounted read-only.
 *
 * Returns 0 on success, a negative error code on failure. */
static int mount_cgroup(const char *dest) {
        _cleanup_set_free_free_ Set *controllers = NULL;
        _cleanup_free_ char *own_cgroup_path = NULL;
        const char *cgroup_root, *systemd_root, *systemd_own;
        int r;

        controllers = set_new(&string_hash_ops);
        if (!controllers)
                return log_oom();

        r = cg_kernel_controllers(controllers);
        if (r < 0)
                return log_error_errno(r, "Failed to determine cgroup controllers: %m");

        r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
        if (r < 0)
                return log_error_errno(r, "Failed to determine our own cgroup path: %m");

        /* The tmpfs is mounted writable first so that the per-controller
         * mount points and symlinks can be created inside it; it is
         * remounted read-only at the very end of this function. */
        cgroup_root = strappenda(dest, "/sys/fs/cgroup");
        if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
                return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");

        /* Drain the controller set one entry at a time; each stolen string
         * is owned (and freed) by this loop iteration. */
        for (;;) {
                _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;

                controller = set_steal_first(controllers);
                if (!controller)
                        break;

                /* Look at how the controller is exposed on the host. */
                origin = strappend("/sys/fs/cgroup/", controller);
                if (!origin)
                        return log_oom();

                r = readlink_malloc(origin, &combined);
                if (r == -EINVAL) {
                        /* Not a symbolic link, but directly a single cgroup hierarchy */

                        r = mount_cgroup_hierarchy(dest, controller, controller, true);
                        if (r < 0)
                                return r;

                } else if (r < 0)
                        return log_error_errno(r, "Failed to read link %s: %m", origin);
                else {
                        _cleanup_free_ char *target = NULL;

                        target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
                        if (!target)
                                return log_oom();

                        /* A symbolic link, a combination of controllers in one hierarchy */

                        if (!filename_is_valid(combined)) {
                                log_warning("Ignoring invalid combined hierarchy %s.", combined);
                                continue;
                        }

                        /* Mount the combined hierarchy under the link target's
                         * name (a no-op if an earlier controller already did),
                         * then recreate the host's symlink inside the container. */
                        r = mount_cgroup_hierarchy(dest, combined, combined, true);
                        if (r < 0)
                                return r;

                        if (symlink(combined, target) < 0)
                                return log_error_errno(errno, "Failed to create symlink for combined hiearchy: %m");
                }
        }

        /* The systemd hierarchy is mounted writable here; it is restricted
         * by the two mount() calls that follow. */
        r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
        if (r < 0)
                return r;

        /* Make our own cgroup a (writable) bind mount */
        systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
        if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
                return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);

        /* And then remount the systemd cgroup root read-only */
        systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
        if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
                return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");

        /* Finally, seal the tmpfs that holds all the mount points. */
        if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
                return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);

        return 0;
}
1055
1056 static int mount_tmpfs(const char *dest) {
1057         char **i, **o;
1058
1059         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1060                 _cleanup_free_ char *where = NULL;
1061                 int r;
1062
1063                 where = strappend(dest, *i);
1064                 if (!where)
1065                         return log_oom();
1066
1067                 r = mkdir_label(where, 0755);
1068                 if (r < 0 && r != -EEXIST)
1069                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1070
1071                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1072                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1073         }
1074
1075         return 0;
1076 }
1077
1078 static int setup_timezone(const char *dest) {
1079         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1080         char *z, *y;
1081         int r;
1082
1083         assert(dest);
1084
1085         /* Fix the timezone, if possible */
1086         r = readlink_malloc("/etc/localtime", &p);
1087         if (r < 0) {
1088                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1089                 return 0;
1090         }
1091
1092         z = path_startswith(p, "../usr/share/zoneinfo/");
1093         if (!z)
1094                 z = path_startswith(p, "/usr/share/zoneinfo/");
1095         if (!z) {
1096                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1097                 return 0;
1098         }
1099
1100         where = strappend(dest, "/etc/localtime");
1101         if (!where)
1102                 return log_oom();
1103
1104         r = readlink_malloc(where, &q);
1105         if (r >= 0) {
1106                 y = path_startswith(q, "../usr/share/zoneinfo/");
1107                 if (!y)
1108                         y = path_startswith(q, "/usr/share/zoneinfo/");
1109
1110                 /* Already pointing to the right place? Then do nothing .. */
1111                 if (y && streq(y, z))
1112                         return 0;
1113         }
1114
1115         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1116         if (!check)
1117                 return log_oom();
1118
1119         if (access(check, F_OK) < 0) {
1120                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1121                 return 0;
1122         }
1123
1124         what = strappend("../usr/share/zoneinfo/", z);
1125         if (!what)
1126                 return log_oom();
1127
1128         r = mkdir_parents(where, 0755);
1129         if (r < 0) {
1130                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1131
1132                 return 0;
1133         }
1134
1135         r = unlink(where);
1136         if (r < 0 && errno != ENOENT) {
1137                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1138
1139                 return 0;
1140         }
1141
1142         if (symlink(what, where) < 0) {
1143                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1144                 return 0;
1145         }
1146
1147         return 0;
1148 }
1149
1150 static int setup_resolv_conf(const char *dest) {
1151         _cleanup_free_ char *where = NULL;
1152         int r;
1153
1154         assert(dest);
1155
1156         if (arg_private_network)
1157                 return 0;
1158
1159         /* Fix resolv.conf, if possible */
1160         where = strappend(dest, "/etc/resolv.conf");
1161         if (!where)
1162                 return log_oom();
1163
1164         /* We don't really care for the results of this really. If it
1165          * fails, it fails, but meh... */
1166         r = mkdir_parents(where, 0755);
1167         if (r < 0) {
1168                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1169
1170                 return 0;
1171         }
1172
1173         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1174         if (r < 0) {
1175                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1176
1177                 return 0;
1178         }
1179
1180         return 0;
1181 }
1182
1183 static int setup_volatile_state(const char *directory) {
1184         const char *p;
1185         int r;
1186
1187         assert(directory);
1188
1189         if (arg_volatile != VOLATILE_STATE)
1190                 return 0;
1191
1192         /* --volatile=state means we simply overmount /var
1193            with a tmpfs, and the rest read-only. */
1194
1195         r = bind_remount_recursive(directory, true);
1196         if (r < 0)
1197                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1198
1199         p = strappenda(directory, "/var");
1200         r = mkdir(p, 0755);
1201         if (r < 0 && errno != EEXIST)
1202                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1203
1204         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1205                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1206
1207         return 0;
1208 }
1209
1210 static int setup_volatile(const char *directory) {
1211         bool tmpfs_mounted = false, bind_mounted = false;
1212         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1213         const char *f, *t;
1214         int r;
1215
1216         assert(directory);
1217
1218         if (arg_volatile != VOLATILE_YES)
1219                 return 0;
1220
1221         /* --volatile=yes means we mount a tmpfs to the root dir, and
1222            the original /usr to use inside it, and that read-only. */
1223
1224         if (!mkdtemp(template))
1225                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1226
1227         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1228                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1229                 r = -errno;
1230                 goto fail;
1231         }
1232
1233         tmpfs_mounted = true;
1234
1235         f = strappenda(directory, "/usr");
1236         t = strappenda(template, "/usr");
1237
1238         r = mkdir(t, 0755);
1239         if (r < 0 && errno != EEXIST) {
1240                 log_error_errno(errno, "Failed to create %s: %m", t);
1241                 r = -errno;
1242                 goto fail;
1243         }
1244
1245         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1246                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1247                 r = -errno;
1248                 goto fail;
1249         }
1250
1251         bind_mounted = true;
1252
1253         r = bind_remount_recursive(t, true);
1254         if (r < 0) {
1255                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1256                 goto fail;
1257         }
1258
1259         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1260                 log_error_errno(errno, "Failed to move root mount: %m");
1261                 r = -errno;
1262                 goto fail;
1263         }
1264
1265         rmdir(template);
1266
1267         return 0;
1268
1269 fail:
1270         if (bind_mounted)
1271                 umount(t);
1272         if (tmpfs_mounted)
1273                 umount(template);
1274         rmdir(template);
1275         return r;
1276 }
1277
1278 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1279
1280         snprintf(s, 37,
1281                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1282                  SD_ID128_FORMAT_VAL(id));
1283
1284         return s;
1285 }
1286
1287 static int setup_boot_id(const char *dest) {
1288         _cleanup_free_ char *from = NULL, *to = NULL;
1289         sd_id128_t rnd = {};
1290         char as_uuid[37];
1291         int r;
1292
1293         assert(dest);
1294
1295         if (arg_share_system)
1296                 return 0;
1297
1298         /* Generate a new randomized boot ID, so that each boot-up of
1299          * the container gets a new one */
1300
1301         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1302         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1303         if (!from || !to)
1304                 return log_oom();
1305
1306         r = sd_id128_randomize(&rnd);
1307         if (r < 0)
1308                 return log_error_errno(r, "Failed to generate random boot id: %m");
1309
1310         id128_format_as_uuid(rnd, as_uuid);
1311
1312         r = write_string_file(from, as_uuid);
1313         if (r < 0)
1314                 return log_error_errno(r, "Failed to write boot id: %m");
1315
1316         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1317                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1318                 r = -errno;
1319         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1320                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1321
1322         unlink(from);
1323         return r;
1324 }
1325
1326 static int copy_devnodes(const char *dest) {
1327
1328         static const char devnodes[] =
1329                 "null\0"
1330                 "zero\0"
1331                 "full\0"
1332                 "random\0"
1333                 "urandom\0"
1334                 "tty\0"
1335                 "net/tun\0";
1336
1337         const char *d;
1338         int r = 0;
1339         _cleanup_umask_ mode_t u;
1340
1341         assert(dest);
1342
1343         u = umask(0000);
1344
1345         NULSTR_FOREACH(d, devnodes) {
1346                 _cleanup_free_ char *from = NULL, *to = NULL;
1347                 struct stat st;
1348
1349                 from = strappend("/dev/", d);
1350                 to = strjoin(dest, "/dev/", d, NULL);
1351                 if (!from || !to)
1352                         return log_oom();
1353
1354                 if (stat(from, &st) < 0) {
1355
1356                         if (errno != ENOENT)
1357                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1358
1359                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1360
1361                         log_error("%s is not a char or block device, cannot copy", from);
1362                         return -EIO;
1363
1364                 } else {
1365                         r = mkdir_parents(to, 0775);
1366                         if (r < 0) {
1367                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1368                                 return -r;
1369                         }
1370
1371                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1372                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1373                 }
1374         }
1375
1376         return r;
1377 }
1378
1379 static int setup_ptmx(const char *dest) {
1380         _cleanup_free_ char *p = NULL;
1381
1382         p = strappend(dest, "/dev/ptmx");
1383         if (!p)
1384                 return log_oom();
1385
1386         if (symlink("pts/ptmx", p) < 0)
1387                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1388
1389         return 0;
1390 }
1391
1392 static int setup_dev_console(const char *dest, const char *console) {
1393         _cleanup_umask_ mode_t u;
1394         const char *to;
1395         struct stat st;
1396         int r;
1397
1398         assert(dest);
1399         assert(console);
1400
1401         u = umask(0000);
1402
1403         if (stat("/dev/null", &st) < 0)
1404                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1405
1406         r = chmod_and_chown(console, 0600, 0, 0);
1407         if (r < 0)
1408                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1409
1410         /* We need to bind mount the right tty to /dev/console since
1411          * ptys can only exist on pts file systems. To have something
1412          * to bind mount things on we create a device node first, and
1413          * use /dev/null for that since we the cgroups device policy
1414          * allows us to create that freely, while we cannot create
1415          * /dev/console. (Note that the major minor doesn't actually
1416          * matter here, since we mount it over anyway). */
1417
1418         to = strappenda(dest, "/dev/console");
1419         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1420                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1421
1422         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1423                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1424
1425         return 0;
1426 }
1427
1428 static int setup_kmsg(const char *dest, int kmsg_socket) {
1429         _cleanup_free_ char *from = NULL, *to = NULL;
1430         _cleanup_umask_ mode_t u;
1431         int r, fd, k;
1432         union {
1433                 struct cmsghdr cmsghdr;
1434                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1435         } control = {};
1436         struct msghdr mh = {
1437                 .msg_control = &control,
1438                 .msg_controllen = sizeof(control),
1439         };
1440         struct cmsghdr *cmsg;
1441
1442         assert(dest);
1443         assert(kmsg_socket >= 0);
1444
1445         u = umask(0000);
1446
1447         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1448          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1449          * on the reading side behave very similar to /proc/kmsg,
1450          * their writing side behaves differently from /dev/kmsg in
1451          * that writing blocks when nothing is reading. In order to
1452          * avoid any problems with containers deadlocking due to this
1453          * we simply make /dev/kmsg unavailable to the container. */
1454         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1455             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1456                 return log_oom();
1457
1458         if (mkfifo(from, 0600) < 0)
1459                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1460
1461         r = chmod_and_chown(from, 0600, 0, 0);
1462         if (r < 0)
1463                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1464
1465         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1466                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1467
1468         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1469         if (fd < 0)
1470                 return log_error_errno(errno, "Failed to open fifo: %m");
1471
1472         cmsg = CMSG_FIRSTHDR(&mh);
1473         cmsg->cmsg_level = SOL_SOCKET;
1474         cmsg->cmsg_type = SCM_RIGHTS;
1475         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1476         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1477
1478         mh.msg_controllen = cmsg->cmsg_len;
1479
1480         /* Store away the fd in the socket, so that it stays open as
1481          * long as we run the child */
1482         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1483         safe_close(fd);
1484
1485         if (k < 0)
1486                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1487
1488         /* And now make the FIFO unavailable as /dev/kmsg... */
1489         unlink(from);
1490         return 0;
1491 }
1492
1493 static int send_rtnl(int send_fd) {
1494         union {
1495                 struct cmsghdr cmsghdr;
1496                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1497         } control = {};
1498         struct msghdr mh = {
1499                 .msg_control = &control,
1500                 .msg_controllen = sizeof(control),
1501         };
1502         struct cmsghdr *cmsg;
1503         _cleanup_close_ int fd = -1;
1504         ssize_t k;
1505
1506         assert(send_fd >= 0);
1507
1508         if (!arg_expose_ports)
1509                 return 0;
1510
1511         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1512         if (fd < 0)
1513                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1514
1515         cmsg = CMSG_FIRSTHDR(&mh);
1516         cmsg->cmsg_level = SOL_SOCKET;
1517         cmsg->cmsg_type = SCM_RIGHTS;
1518         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1519         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1520
1521         mh.msg_controllen = cmsg->cmsg_len;
1522
1523         /* Store away the fd in the socket, so that it stays open as
1524          * long as we run the child */
1525         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1526         if (k < 0)
1527                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1528
1529         return 0;
1530 }
1531
1532 static int flush_ports(union in_addr_union *exposed) {
1533         ExposePort *p;
1534         int r, af = AF_INET;
1535
1536         assert(exposed);
1537
1538         if (!arg_expose_ports)
1539                 return 0;
1540
1541         if (in_addr_is_null(af, exposed))
1542                 return 0;
1543
1544         log_debug("Lost IP address.");
1545
1546         LIST_FOREACH(ports, p, arg_expose_ports) {
1547                 r = fw_add_local_dnat(false,
1548                                       af,
1549                                       p->protocol,
1550                                       NULL,
1551                                       NULL, 0,
1552                                       NULL, 0,
1553                                       p->host_port,
1554                                       exposed,
1555                                       p->container_port,
1556                                       NULL);
1557                 if (r < 0)
1558                         log_warning_errno(r, "Failed to modify firewall: %m");
1559         }
1560
1561         *exposed = IN_ADDR_NULL;
1562         return 0;
1563 }
1564
1565 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1566         _cleanup_free_ struct local_address *addresses = NULL;
1567         _cleanup_free_ char *pretty = NULL;
1568         union in_addr_union new_exposed;
1569         ExposePort *p;
1570         bool add;
1571         int af = AF_INET, r;
1572
1573         assert(exposed);
1574
1575         /* Invoked each time an address is added or removed inside the
1576          * container */
1577
1578         if (!arg_expose_ports)
1579                 return 0;
1580
1581         r = local_addresses(rtnl, 0, af, &addresses);
1582         if (r < 0)
1583                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1584
1585         add = r > 0 &&
1586                 addresses[0].family == af &&
1587                 addresses[0].scope < RT_SCOPE_LINK;
1588
1589         if (!add)
1590                 return flush_ports(exposed);
1591
1592         new_exposed = addresses[0].address;
1593         if (in_addr_equal(af, exposed, &new_exposed))
1594                 return 0;
1595
1596         in_addr_to_string(af, &new_exposed, &pretty);
1597         log_debug("New container IP is %s.", strna(pretty));
1598
1599         LIST_FOREACH(ports, p, arg_expose_ports) {
1600
1601                 r = fw_add_local_dnat(true,
1602                                       af,
1603                                       p->protocol,
1604                                       NULL,
1605                                       NULL, 0,
1606                                       NULL, 0,
1607                                       p->host_port,
1608                                       &new_exposed,
1609                                       p->container_port,
1610                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1611                 if (r < 0)
1612                         log_warning_errno(r, "Failed to modify firewall: %m");
1613         }
1614
1615         *exposed = new_exposed;
1616         return 0;
1617 }
1618
1619 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1620         union in_addr_union *exposed = userdata;
1621
1622         assert(rtnl);
1623         assert(m);
1624         assert(exposed);
1625
1626         expose_ports(rtnl, exposed);
1627         return 0;
1628 }
1629
1630 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1631         union {
1632                 struct cmsghdr cmsghdr;
1633                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1634         } control = {};
1635         struct msghdr mh = {
1636                 .msg_control = &control,
1637                 .msg_controllen = sizeof(control),
1638         };
1639         struct cmsghdr *cmsg;
1640         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1641         int fd, r;
1642         ssize_t k;
1643
1644         assert(event);
1645         assert(recv_fd >= 0);
1646         assert(ret);
1647
1648         if (!arg_expose_ports)
1649                 return 0;
1650
1651         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1652         if (k < 0)
1653                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1654
1655         cmsg = CMSG_FIRSTHDR(&mh);
1656         assert(cmsg->cmsg_level == SOL_SOCKET);
1657         assert(cmsg->cmsg_type == SCM_RIGHTS);
1658         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1659         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1660
1661         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1662         if (r < 0) {
1663                 safe_close(fd);
1664                 return log_error_errno(r, "Failed to create rtnl object: %m");
1665         }
1666
1667         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1668         if (r < 0)
1669                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1670
1671         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1672         if (r < 0)
1673                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1674
1675         r = sd_rtnl_attach_event(rtnl, event, 0);
1676         if (r < 0)
1677                 return log_error_errno(r, "Failed to add to even loop: %m");
1678
1679         *ret = rtnl;
1680         rtnl = NULL;
1681
1682         return 0;
1683 }
1684
1685 static int setup_hostname(void) {
1686
1687         if (arg_share_system)
1688                 return 0;
1689
1690         if (sethostname_idempotent(arg_machine) < 0)
1691                 return -errno;
1692
1693         return 0;
1694 }
1695
1696 static int setup_journal(const char *directory) {
1697         sd_id128_t machine_id, this_id;
1698         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1699         char *id;
1700         int r;
1701
1702         /* Don't link journals in ephemeral mode */
1703         if (arg_ephemeral)
1704                 return 0;
1705
1706         p = strappend(directory, "/etc/machine-id");
1707         if (!p)
1708                 return log_oom();
1709
1710         r = read_one_line_file(p, &b);
1711         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1712                 return 0;
1713         else if (r < 0)
1714                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1715
1716         id = strstrip(b);
1717         if (isempty(id) && arg_link_journal == LINK_AUTO)
1718                 return 0;
1719
1720         /* Verify validity */
1721         r = sd_id128_from_string(id, &machine_id);
1722         if (r < 0)
1723                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1724
1725         r = sd_id128_get_machine(&this_id);
1726         if (r < 0)
1727                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1728
1729         if (sd_id128_equal(machine_id, this_id)) {
1730                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1731                          "Host and machine ids are equal (%s): refusing to link journals", id);
1732                 if (arg_link_journal == LINK_AUTO)
1733                         return 0;
1734                 return -EEXIST;
1735         }
1736
1737         if (arg_link_journal == LINK_NO)
1738                 return 0;
1739
1740         free(p);
1741         p = strappend("/var/log/journal/", id);
1742         q = strjoin(directory, "/var/log/journal/", id, NULL);
1743         if (!p || !q)
1744                 return log_oom();
1745
1746         if (path_is_mount_point(p, false) > 0) {
1747                 if (arg_link_journal != LINK_AUTO) {
1748                         log_error("%s: already a mount point, refusing to use for journal", p);
1749                         return -EEXIST;
1750                 }
1751
1752                 return 0;
1753         }
1754
1755         if (path_is_mount_point(q, false) > 0) {
1756                 if (arg_link_journal != LINK_AUTO) {
1757                         log_error("%s: already a mount point, refusing to use for journal", q);
1758                         return -EEXIST;
1759                 }
1760
1761                 return 0;
1762         }
1763
1764         r = readlink_and_make_absolute(p, &d);
1765         if (r >= 0) {
1766                 if ((arg_link_journal == LINK_GUEST ||
1767                      arg_link_journal == LINK_AUTO) &&
1768                     path_equal(d, q)) {
1769
1770                         r = mkdir_p(q, 0755);
1771                         if (r < 0)
1772                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1773                         return 0;
1774                 }
1775
1776                 if (unlink(p) < 0)
1777                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1778         } else if (r == -EINVAL) {
1779
1780                 if (arg_link_journal == LINK_GUEST &&
1781                     rmdir(p) < 0) {
1782
1783                         if (errno == ENOTDIR) {
1784                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1785                                 return r;
1786                         } else {
1787                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1788                                 return -errno;
1789                         }
1790                 }
1791         } else if (r != -ENOENT) {
1792                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1793                 return r;
1794         }
1795
1796         if (arg_link_journal == LINK_GUEST) {
1797
1798                 if (symlink(q, p) < 0) {
1799                         if (arg_link_journal_try) {
1800                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1801                                 return 0;
1802                         } else {
1803                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1804                                 return -errno;
1805                         }
1806                 }
1807
1808                 r = mkdir_p(q, 0755);
1809                 if (r < 0)
1810                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1811                 return 0;
1812         }
1813
1814         if (arg_link_journal == LINK_HOST) {
1815                 /* don't create parents here -- if the host doesn't have
1816                  * permanent journal set up, don't force it here */
1817                 r = mkdir(p, 0755);
1818                 if (r < 0) {
1819                         if (arg_link_journal_try) {
1820                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1821                                 return 0;
1822                         } else {
1823                                 log_error_errno(errno, "Failed to create %s: %m", p);
1824                                 return r;
1825                         }
1826                 }
1827
1828         } else if (access(p, F_OK) < 0)
1829                 return 0;
1830
1831         if (dir_is_empty(q) == 0)
1832                 log_warning("%s is not empty, proceeding anyway.", q);
1833
1834         r = mkdir_p(q, 0755);
1835         if (r < 0) {
1836                 log_error_errno(errno, "Failed to create %s: %m", q);
1837                 return r;
1838         }
1839
1840         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1841                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1842
1843         return 0;
1844 }
1845
1846 static int drop_capabilities(void) {
1847         return capability_bounding_set_drop(~arg_retain, false);
1848 }
1849
1850 static int register_machine(pid_t pid, int local_ifindex) {
1851         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1852         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1853         int r;
1854
1855         if (!arg_register)
1856                 return 0;
1857
1858         r = sd_bus_default_system(&bus);
1859         if (r < 0)
1860                 return log_error_errno(r, "Failed to open system bus: %m");
1861
1862         if (arg_keep_unit) {
1863                 r = sd_bus_call_method(
1864                                 bus,
1865                                 "org.freedesktop.machine1",
1866                                 "/org/freedesktop/machine1",
1867                                 "org.freedesktop.machine1.Manager",
1868                                 "RegisterMachineWithNetwork",
1869                                 &error,
1870                                 NULL,
1871                                 "sayssusai",
1872                                 arg_machine,
1873                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1874                                 "nspawn",
1875                                 "container",
1876                                 (uint32_t) pid,
1877                                 strempty(arg_directory),
1878                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1879         } else {
1880                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1881
1882                 r = sd_bus_message_new_method_call(
1883                                 bus,
1884                                 &m,
1885                                 "org.freedesktop.machine1",
1886                                 "/org/freedesktop/machine1",
1887                                 "org.freedesktop.machine1.Manager",
1888                                 "CreateMachineWithNetwork");
1889                 if (r < 0)
1890                         return log_error_errno(r, "Failed to create message: %m");
1891
1892                 r = sd_bus_message_append(
1893                                 m,
1894                                 "sayssusai",
1895                                 arg_machine,
1896                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1897                                 "nspawn",
1898                                 "container",
1899                                 (uint32_t) pid,
1900                                 strempty(arg_directory),
1901                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1902                 if (r < 0)
1903                         return log_error_errno(r, "Failed to append message arguments: %m");
1904
1905                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1906                 if (r < 0)
1907                         return log_error_errno(r, "Failed to open container: %m");
1908
1909                 if (!isempty(arg_slice)) {
1910                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1911                         if (r < 0)
1912                                 return log_error_errno(r, "Failed to append slice: %m");
1913                 }
1914
1915                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1916                 if (r < 0)
1917                         return log_error_errno(r, "Failed to add device policy: %m");
1918
1919                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1920                                           /* Allow the container to
1921                                            * access and create the API
1922                                            * device nodes, so that
1923                                            * PrivateDevices= in the
1924                                            * container can work
1925                                            * fine */
1926                                           "/dev/null", "rwm",
1927                                           "/dev/zero", "rwm",
1928                                           "/dev/full", "rwm",
1929                                           "/dev/random", "rwm",
1930                                           "/dev/urandom", "rwm",
1931                                           "/dev/tty", "rwm",
1932                                           "/dev/net/tun", "rwm",
1933                                           /* Allow the container
1934                                            * access to ptys. However,
1935                                            * do not permit the
1936                                            * container to ever create
1937                                            * these device nodes. */
1938                                           "/dev/pts/ptmx", "rw",
1939                                           "char-pts", "rw");
1940                 if (r < 0)
1941                         return log_error_errno(r, "Failed to add device whitelist: %m");
1942
1943                 r = sd_bus_message_close_container(m);
1944                 if (r < 0)
1945                         return log_error_errno(r, "Failed to close container: %m");
1946
1947                 r = sd_bus_call(bus, m, 0, &error, NULL);
1948         }
1949
1950         if (r < 0) {
1951                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1952                 return r;
1953         }
1954
1955         return 0;
1956 }
1957
1958 static int terminate_machine(pid_t pid) {
1959         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1960         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1961         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1962         const char *path;
1963         int r;
1964
1965         if (!arg_register)
1966                 return 0;
1967
1968         r = sd_bus_default_system(&bus);
1969         if (r < 0)
1970                 return log_error_errno(r, "Failed to open system bus: %m");
1971
1972         r = sd_bus_call_method(
1973                         bus,
1974                         "org.freedesktop.machine1",
1975                         "/org/freedesktop/machine1",
1976                         "org.freedesktop.machine1.Manager",
1977                         "GetMachineByPID",
1978                         &error,
1979                         &reply,
1980                         "u",
1981                         (uint32_t) pid);
1982         if (r < 0) {
1983                 /* Note that the machine might already have been
1984                  * cleaned up automatically, hence don't consider it a
1985                  * failure if we cannot get the machine object. */
1986                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1987                 return 0;
1988         }
1989
1990         r = sd_bus_message_read(reply, "o", &path);
1991         if (r < 0)
1992                 return bus_log_parse_error(r);
1993
1994         r = sd_bus_call_method(
1995                         bus,
1996                         "org.freedesktop.machine1",
1997                         path,
1998                         "org.freedesktop.machine1.Machine",
1999                         "Terminate",
2000                         &error,
2001                         NULL,
2002                         NULL);
2003         if (r < 0) {
2004                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2005                 return 0;
2006         }
2007
2008         return 0;
2009 }
2010
2011 static int reset_audit_loginuid(void) {
2012         _cleanup_free_ char *p = NULL;
2013         int r;
2014
2015         if (arg_share_system)
2016                 return 0;
2017
2018         r = read_one_line_file("/proc/self/loginuid", &p);
2019         if (r == -ENOENT)
2020                 return 0;
2021         if (r < 0)
2022                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2023
2024         /* Already reset? */
2025         if (streq(p, "4294967295"))
2026                 return 0;
2027
2028         r = write_string_file("/proc/self/loginuid", "4294967295");
2029         if (r < 0) {
2030                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2031                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2032                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2033                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2034                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2035
2036                 sleep(5);
2037         }
2038
2039         return 0;
2040 }
2041
/* Fixed keys handed to generate_mac() as hash key: one each for the
 * host side and the container side of the veth link, and one for
 * MACVLAN interfaces. */
#define HOST_HASH_KEY \
        SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY \
        SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY \
        SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2045
2046 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2047         uint8_t result[8];
2048         size_t l, sz;
2049         uint8_t *v, *i;
2050         int r;
2051
2052         l = strlen(arg_machine);
2053         sz = sizeof(sd_id128_t) + l;
2054         if (idx > 0)
2055                 sz += sizeof(idx);
2056
2057         v = alloca(sz);
2058
2059         /* fetch some persistent data unique to the host */
2060         r = sd_id128_get_machine((sd_id128_t*) v);
2061         if (r < 0)
2062                 return r;
2063
2064         /* combine with some data unique (on this host) to this
2065          * container instance */
2066         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2067         if (idx > 0) {
2068                 idx = htole64(idx);
2069                 memcpy(i, &idx, sizeof(idx));
2070         }
2071
2072         /* Let's hash the host machine ID plus the container name. We
2073          * use a fixed, but originally randomly created hash key here. */
2074         siphash24(result, v, sz, hash_key.bytes);
2075
2076         assert_cc(ETH_ALEN <= sizeof(result));
2077         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2078
2079         /* see eth_random_addr in the kernel */
2080         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2081         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2082
2083         return 0;
2084 }
2085
2086 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2087         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2088         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2089         struct ether_addr mac_host, mac_container;
2090         int r, i;
2091
2092         if (!arg_private_network)
2093                 return 0;
2094
2095         if (!arg_network_veth)
2096                 return 0;
2097
2098         /* Use two different interface name prefixes depending whether
2099          * we are in bridge mode or not. */
2100         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2101                  arg_network_bridge ? "vb" : "ve", arg_machine);
2102
2103         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2104         if (r < 0)
2105                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2106
2107         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2108         if (r < 0)
2109                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2110
2111         r = sd_rtnl_open(&rtnl, 0);
2112         if (r < 0)
2113                 return log_error_errno(r, "Failed to connect to netlink: %m");
2114
2115         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2116         if (r < 0)
2117                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2118
2119         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2120         if (r < 0)
2121                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2122
2123         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2124         if (r < 0)
2125                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2126
2127         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2128         if (r < 0)
2129                 return log_error_errno(r, "Failed to open netlink container: %m");
2130
2131         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2132         if (r < 0)
2133                 return log_error_errno(r, "Failed to open netlink container: %m");
2134
2135         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2136         if (r < 0)
2137                 return log_error_errno(r, "Failed to open netlink container: %m");
2138
2139         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2140         if (r < 0)
2141                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2142
2143         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2144         if (r < 0)
2145                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2146
2147         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2148         if (r < 0)
2149                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2150
2151         r = sd_rtnl_message_close_container(m);
2152         if (r < 0)
2153                 return log_error_errno(r, "Failed to close netlink container: %m");
2154
2155         r = sd_rtnl_message_close_container(m);
2156         if (r < 0)
2157                 return log_error_errno(r, "Failed to close netlink container: %m");
2158
2159         r = sd_rtnl_message_close_container(m);
2160         if (r < 0)
2161                 return log_error_errno(r, "Failed to close netlink container: %m");
2162
2163         r = sd_rtnl_call(rtnl, m, 0, NULL);
2164         if (r < 0)
2165                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2166
2167         i = (int) if_nametoindex(iface_name);
2168         if (i <= 0)
2169                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2170
2171         *ifi = i;
2172
2173         return 0;
2174 }
2175
2176 static int setup_bridge(const char veth_name[], int *ifi) {
2177         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2178         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2179         int r, bridge;
2180
2181         if (!arg_private_network)
2182                 return 0;
2183
2184         if (!arg_network_veth)
2185                 return 0;
2186
2187         if (!arg_network_bridge)
2188                 return 0;
2189
2190         bridge = (int) if_nametoindex(arg_network_bridge);
2191         if (bridge <= 0)
2192                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2193
2194         *ifi = bridge;
2195
2196         r = sd_rtnl_open(&rtnl, 0);
2197         if (r < 0)
2198                 return log_error_errno(r, "Failed to connect to netlink: %m");
2199
2200         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2201         if (r < 0)
2202                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2203
2204         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2205         if (r < 0)
2206                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2207
2208         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2209         if (r < 0)
2210                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2211
2212         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2213         if (r < 0)
2214                 return log_error_errno(r, "Failed to add netlink master field: %m");
2215
2216         r = sd_rtnl_call(rtnl, m, 0, NULL);
2217         if (r < 0)
2218                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2219
2220         return 0;
2221 }
2222
2223 static int parse_interface(struct udev *udev, const char *name) {
2224         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2225         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2226         int ifi;
2227
2228         ifi = (int) if_nametoindex(name);
2229         if (ifi <= 0)
2230                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2231
2232         sprintf(ifi_str, "n%i", ifi);
2233         d = udev_device_new_from_device_id(udev, ifi_str);
2234         if (!d)
2235                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2236
2237         if (udev_device_get_is_initialized(d) <= 0) {
2238                 log_error("Network interface %s is not initialized yet.", name);
2239                 return -EBUSY;
2240         }
2241
2242         return ifi;
2243 }
2244
2245 static int move_network_interfaces(pid_t pid) {
2246         _cleanup_udev_unref_ struct udev *udev = NULL;
2247         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2248         char **i;
2249         int r;
2250
2251         if (!arg_private_network)
2252                 return 0;
2253
2254         if (strv_isempty(arg_network_interfaces))
2255                 return 0;
2256
2257         r = sd_rtnl_open(&rtnl, 0);
2258         if (r < 0)
2259                 return log_error_errno(r, "Failed to connect to netlink: %m");
2260
2261         udev = udev_new();
2262         if (!udev) {
2263                 log_error("Failed to connect to udev.");
2264                 return -ENOMEM;
2265         }
2266
2267         STRV_FOREACH(i, arg_network_interfaces) {
2268                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2269                 int ifi;
2270
2271                 ifi = parse_interface(udev, *i);
2272                 if (ifi < 0)
2273                         return ifi;
2274
2275                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2276                 if (r < 0)
2277                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2278
2279                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2280                 if (r < 0)
2281                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2282
2283                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2284                 if (r < 0)
2285                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2286         }
2287
2288         return 0;
2289 }
2290
2291 static int setup_macvlan(pid_t pid) {
2292         _cleanup_udev_unref_ struct udev *udev = NULL;
2293         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2294         unsigned idx = 0;
2295         char **i;
2296         int r;
2297
2298         if (!arg_private_network)
2299                 return 0;
2300
2301         if (strv_isempty(arg_network_macvlan))
2302                 return 0;
2303
2304         r = sd_rtnl_open(&rtnl, 0);
2305         if (r < 0)
2306                 return log_error_errno(r, "Failed to connect to netlink: %m");
2307
2308         udev = udev_new();
2309         if (!udev) {
2310                 log_error("Failed to connect to udev.");
2311                 return -ENOMEM;
2312         }
2313
2314         STRV_FOREACH(i, arg_network_macvlan) {
2315                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2316                 _cleanup_free_ char *n = NULL;
2317                 struct ether_addr mac;
2318                 int ifi;
2319
2320                 ifi = parse_interface(udev, *i);
2321                 if (ifi < 0)
2322                         return ifi;
2323
2324                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2325                 if (r < 0)
2326                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2327
2328                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2329                 if (r < 0)
2330                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2331
2332                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2333                 if (r < 0)
2334                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2335
2336                 n = strappend("mv-", *i);
2337                 if (!n)
2338                         return log_oom();
2339
2340                 strshorten(n, IFNAMSIZ-1);
2341
2342                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2343                 if (r < 0)
2344                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2345
2346                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2347                 if (r < 0)
2348                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2349
2350                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2351                 if (r < 0)
2352                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2353
2354                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2355                 if (r < 0)
2356                         return log_error_errno(r, "Failed to open netlink container: %m");
2357
2358                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2359                 if (r < 0)
2360                         return log_error_errno(r, "Failed to open netlink container: %m");
2361
2362                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2363                 if (r < 0)
2364                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2365
2366                 r = sd_rtnl_message_close_container(m);
2367                 if (r < 0)
2368                         return log_error_errno(r, "Failed to close netlink container: %m");
2369
2370                 r = sd_rtnl_message_close_container(m);
2371                 if (r < 0)
2372                         return log_error_errno(r, "Failed to close netlink container: %m");
2373
2374                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2375                 if (r < 0)
2376                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2377         }
2378
2379         return 0;
2380 }
2381
/* Builds and loads a default-allow seccomp filter that makes a fixed list of
 * system calls fail with EPERM and makes creation of NETLINK_AUDIT sockets
 * fail with EAFNOSUPPORT. When built without HAVE_SECCOMP this does nothing.
 *
 * Returns 0 on success (or when seccomp support is compiled out) and a
 * negative error value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blocked_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[k], 0);

                /* -EFAULT means the syscall is unknown; that is not an error for us. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Loading the filter must not switch on NO_NEW_PRIVS. */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path, success included. */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2461
2462 static int setup_propagate(const char *root) {
2463         const char *p, *q;
2464
2465         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2466         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2467         p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2468         (void) mkdir_p(p, 0600);
2469
2470         q = strappenda(root, "/run/systemd/nspawn/incoming");
2471         mkdir_parents(q, 0755);
2472         mkdir_p(q, 0600);
2473
2474         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2475                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2476
2477         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2478                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2479
2480         return 0;
2481 }
2482
2483 static int setup_image(char **device_path, int *loop_nr) {
2484         struct loop_info64 info = {
2485                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2486         };
2487         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2488         _cleanup_free_ char* loopdev = NULL;
2489         struct stat st;
2490         int r, nr;
2491
2492         assert(device_path);
2493         assert(loop_nr);
2494         assert(arg_image);
2495
2496         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2497         if (fd < 0)
2498                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2499
2500         if (fstat(fd, &st) < 0)
2501                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2502
2503         if (S_ISBLK(st.st_mode)) {
2504                 char *p;
2505
2506                 p = strdup(arg_image);
2507                 if (!p)
2508                         return log_oom();
2509
2510                 *device_path = p;
2511
2512                 *loop_nr = -1;
2513
2514                 r = fd;
2515                 fd = -1;
2516
2517                 return r;
2518         }
2519
2520         if (!S_ISREG(st.st_mode)) {
2521                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2522                 return -EINVAL;
2523         }
2524
2525         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2526         if (control < 0)
2527                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2528
2529         nr = ioctl(control, LOOP_CTL_GET_FREE);
2530         if (nr < 0)
2531                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2532
2533         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2534                 return log_oom();
2535
2536         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2537         if (loop < 0)
2538                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2539
2540         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2541                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2542
2543         if (arg_read_only)
2544                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2545
2546         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2547                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2548
2549         *device_path = loopdev;
2550         loopdev = NULL;
2551
2552         *loop_nr = nr;
2553
2554         r = loop;
2555         loop = -1;
2556
2557         return r;
2558 }
2559
/* Inspects the GPT partition table of the block device open on fd and picks
 * the partitions to use as root, /home and /srv, identified by their GPT
 * partition type UUIDs.
 *
 * On success returns 0 and:
 *   - *root_device / *root_device_rw describe the root partition; *secondary
 *     tells whether it is of the GPT_ROOT_SECONDARY rather than the
 *     GPT_ROOT_NATIVE type,
 *   - *home_device / *home_device_rw and *srv_device / *srv_device_rw are
 *     filled in only if such a partition was found (otherwise left untouched).
 * The device strings are newly allocated and owned by the caller. The *_rw
 * flags are the inverse of the partition's GPT_FLAG_READ_ONLY flag.
 *
 * Returns a negative errno-style value on failure, -EINVAL in particular if
 * no GPT or no root partition is found. Without HAVE_BLKID this always fails
 * with -ENOTSUP.
 *
 * NOTE(review): the *_rw output pointers are dereferenced without being
 * asserted, unlike the other output parameters. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        /* Partition numbers of the currently selected candidates, used to
         * prefer the lowest-numbered partition of each type. */
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif

        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared first so that a failure which does not set errno
         * can be told apart and reported as out-of-memory. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* Results -2 and 1 are treated as "no usable partition table";
         * any other non-zero result is a probing error. */
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        /* Only GPT images are supported; pttype stays NULL if the lookup fails,
         * which streq_ptr() treats as a mismatch. */
        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        /* Look up the udev device of the whole disk and enumerate its children,
         * i.e. the partition block devices. */
        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* q is assigned right below, before any path that leaves this scope. */
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number ... */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* ... the whole-disk device itself ... */
                if (st.st_rdev == qn)
                        continue;

                /* ... and devices without a device node. */
                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                /* Map the udev device back to its partition table entry. */
                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                /* Partitions flagged GPT_FLAG_NO_AUTO are ignored. */
                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                /* For each type, keep only the candidate with the lowest
                 * partition number seen so far. */
                if (sd_id128_equal(type_id, GPT_HOME)) {

                        if (home && nr >= home_nr)
                                continue;

                        home_nr = nr;
                        home_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(home);
                        home = strdup(node);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_id, GPT_SRV)) {

                        if (srv && nr >= srv_nr)
                                continue;

                        srv_nr = nr;
                        srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(srv);
                        srv = strdup(node);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                        if (root && nr >= root_nr)
                                continue;

                        root_nr = nr;
                        root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(root);
                        root = strdup(node);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && nr >= secondary_root_nr)
                                continue;

                        secondary_root_nr = nr;
                        secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);


                        free(secondary_root);
                        secondary_root = strdup(node);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* A native root partition takes precedence over a secondary one.
         * Ownership of the strings moves to the caller; the locals are reset
         * to NULL so the cleanup handlers do not free them. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2813
/* Probes the file system type of the block device `what` with libblkid and
 * mounts it (MS_NODEV) on `where`, or on `where` + `directory` if `directory`
 * is non-NULL.
 *
 * The mount is read-only if `rw` is false or if arg_read_only is set.
 * LUKS-encrypted volumes are rejected with -ENOTSUP.
 *
 * Returns 0 on success, a negative errno-style value on failure. Without
 * HAVE_BLKID this always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* errno is cleared first so that a failure which does not set errno
         * can be told apart and reported as out-of-memory. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 if the
         * result is ambivalent; both mean the type cannot be determined. -1 is
         * a genuine probing error and is handled by the branch below. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Latch the error code before logging, which may clobber errno. */
                int e = errno != 0 ? errno : EINVAL;

                log_error("Failed to determine file system type of %s", what);
                return -e;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2877
2878 static int mount_devices(
2879                 const char *where,
2880                 const char *root_device, bool root_device_rw,
2881                 const char *home_device, bool home_device_rw,
2882                 const char *srv_device, bool srv_device_rw) {
2883         int r;
2884
2885         assert(where);
2886
2887         if (root_device) {
2888                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2889                 if (r < 0)
2890                         return log_error_errno(r, "Failed to mount root directory: %m");
2891         }
2892
2893         if (home_device) {
2894                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2895                 if (r < 0)
2896                         return log_error_errno(r, "Failed to mount home directory: %m");
2897         }
2898
2899         if (srv_device) {
2900                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2901                 if (r < 0)
2902                         return log_error_errno(r, "Failed to mount server data directory: %m");
2903         }
2904
2905         return 0;
2906 }
2907
2908 static void loop_remove(int nr, int *image_fd) {
2909         _cleanup_close_ int control = -1;
2910         int r;
2911
2912         if (nr < 0)
2913                 return;
2914
2915         if (image_fd && *image_fd >= 0) {
2916                 r = ioctl(*image_fd, LOOP_CLR_FD);
2917                 if (r < 0)
2918                         log_warning_errno(errno, "Failed to close loop image: %m");
2919                 *image_fd = safe_close(*image_fd);
2920         }
2921
2922         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2923         if (control < 0) {
2924                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2925                 return;
2926         }
2927
2928         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2929         if (r < 0)
2930                 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
2931 }
2932
2933 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2934         int pipe_fds[2];
2935         pid_t pid;
2936
2937         assert(database);
2938         assert(key);
2939         assert(rpid);
2940
2941         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2942                 return log_error_errno(errno, "Failed to allocate pipe: %m");
2943
2944         pid = fork();
2945         if (pid < 0)
2946                 return log_error_errno(errno, "Failed to fork getent child: %m");
2947         else if (pid == 0) {
2948                 int nullfd;
2949                 char *empty_env = NULL;
2950
2951                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2952                         _exit(EXIT_FAILURE);
2953
2954                 if (pipe_fds[0] > 2)
2955                         safe_close(pipe_fds[0]);
2956                 if (pipe_fds[1] > 2)
2957                         safe_close(pipe_fds[1]);
2958
2959                 nullfd = open("/dev/null", O_RDWR);
2960                 if (nullfd < 0)
2961                         _exit(EXIT_FAILURE);
2962
2963                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2964                         _exit(EXIT_FAILURE);
2965
2966                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2967                         _exit(EXIT_FAILURE);
2968
2969                 if (nullfd > 2)
2970                         safe_close(nullfd);
2971
2972                 reset_all_signal_handlers();
2973                 close_all_fds(NULL, 0);
2974
2975                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2976                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2977                 _exit(EXIT_FAILURE);
2978         }
2979
2980         pipe_fds[1] = safe_close(pipe_fds[1]);
2981
2982         *rpid = pid;
2983
2984         return pipe_fds[0];
2985 }
2986
2987 static int change_uid_gid(char **_home) {
2988         char line[LINE_MAX], *x, *u, *g, *h;
2989         const char *word, *state;
2990         _cleanup_free_ uid_t *uids = NULL;
2991         _cleanup_free_ char *home = NULL;
2992         _cleanup_fclose_ FILE *f = NULL;
2993         _cleanup_close_ int fd = -1;
2994         unsigned n_uids = 0;
2995         size_t sz = 0, l;
2996         uid_t uid;
2997         gid_t gid;
2998         pid_t pid;
2999         int r;
3000
3001         assert(_home);
3002
3003         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3004                 /* Reset everything fully to 0, just in case */
3005
3006                 if (setgroups(0, NULL) < 0)
3007                         return log_error_errno(errno, "setgroups() failed: %m");
3008
3009                 if (setresgid(0, 0, 0) < 0)
3010                         return log_error_errno(errno, "setregid() failed: %m");
3011
3012                 if (setresuid(0, 0, 0) < 0)
3013                         return log_error_errno(errno, "setreuid() failed: %m");
3014
3015                 *_home = NULL;
3016                 return 0;
3017         }
3018
3019         /* First, get user credentials */
3020         fd = spawn_getent("passwd", arg_user, &pid);
3021         if (fd < 0)
3022                 return fd;
3023
3024         f = fdopen(fd, "r");
3025         if (!f)
3026                 return log_oom();
3027         fd = -1;
3028
3029         if (!fgets(line, sizeof(line), f)) {
3030
3031                 if (!ferror(f)) {
3032                         log_error("Failed to resolve user %s.", arg_user);
3033                         return -ESRCH;
3034                 }
3035
3036                 log_error_errno(errno, "Failed to read from getent: %m");
3037                 return -errno;
3038         }
3039
3040         truncate_nl(line);
3041
3042         wait_for_terminate_and_warn("getent passwd", pid, true);
3043
3044         x = strchr(line, ':');
3045         if (!x) {
3046                 log_error("/etc/passwd entry has invalid user field.");
3047                 return -EIO;
3048         }
3049
3050         u = strchr(x+1, ':');
3051         if (!u) {
3052                 log_error("/etc/passwd entry has invalid password field.");
3053                 return -EIO;
3054         }
3055
3056         u++;
3057         g = strchr(u, ':');
3058         if (!g) {
3059                 log_error("/etc/passwd entry has invalid UID field.");
3060                 return -EIO;
3061         }
3062
3063         *g = 0;
3064         g++;
3065         x = strchr(g, ':');
3066         if (!x) {
3067                 log_error("/etc/passwd entry has invalid GID field.");
3068                 return -EIO;
3069         }
3070
3071         *x = 0;
3072         h = strchr(x+1, ':');
3073         if (!h) {
3074                 log_error("/etc/passwd entry has invalid GECOS field.");
3075                 return -EIO;
3076         }
3077
3078         h++;
3079         x = strchr(h, ':');
3080         if (!x) {
3081                 log_error("/etc/passwd entry has invalid home directory field.");
3082                 return -EIO;
3083         }
3084
3085         *x = 0;
3086
3087         r = parse_uid(u, &uid);
3088         if (r < 0) {
3089                 log_error("Failed to parse UID of user.");
3090                 return -EIO;
3091         }
3092
3093         r = parse_gid(g, &gid);
3094         if (r < 0) {
3095                 log_error("Failed to parse GID of user.");
3096                 return -EIO;
3097         }
3098
3099         home = strdup(h);
3100         if (!home)
3101                 return log_oom();
3102
3103         /* Second, get group memberships */
3104         fd = spawn_getent("initgroups", arg_user, &pid);
3105         if (fd < 0)
3106                 return fd;
3107
3108         fclose(f);
3109         f = fdopen(fd, "r");
3110         if (!f)
3111                 return log_oom();
3112         fd = -1;
3113
3114         if (!fgets(line, sizeof(line), f)) {
3115                 if (!ferror(f)) {
3116                         log_error("Failed to resolve user %s.", arg_user);
3117                         return -ESRCH;
3118                 }
3119
3120                 log_error_errno(errno, "Failed to read from getent: %m");
3121                 return -errno;
3122         }
3123
3124         truncate_nl(line);
3125
3126         wait_for_terminate_and_warn("getent initgroups", pid, true);
3127
3128         /* Skip over the username and subsequent separator whitespace */
3129         x = line;
3130         x += strcspn(x, WHITESPACE);
3131         x += strspn(x, WHITESPACE);
3132
3133         FOREACH_WORD(word, l, x, state) {
3134                 char c[l+1];
3135
3136                 memcpy(c, word, l);
3137                 c[l] = 0;
3138
3139                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3140                         return log_oom();
3141
3142                 r = parse_uid(c, &uids[n_uids++]);
3143                 if (r < 0) {
3144                         log_error("Failed to parse group data from getent.");
3145                         return -EIO;
3146                 }
3147         }
3148
3149         r = mkdir_parents(home, 0775);
3150         if (r < 0)
3151                 return log_error_errno(r, "Failed to make home root directory: %m");
3152
3153         r = mkdir_safe(home, 0755, uid, gid);
3154         if (r < 0 && r != -EEXIST)
3155                 return log_error_errno(r, "Failed to make home directory: %m");
3156
3157         fchown(STDIN_FILENO, uid, gid);
3158         fchown(STDOUT_FILENO, uid, gid);
3159         fchown(STDERR_FILENO, uid, gid);
3160
3161         if (setgroups(n_uids, uids) < 0)
3162                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3163
3164         if (setresgid(gid, gid, gid) < 0)
3165                 return log_error_errno(errno, "setregid() failed: %m");
3166
3167         if (setresuid(uid, uid, uid) < 0)
3168                 return log_error_errno(errno, "setreuid() failed: %m");
3169
3170         if (_home) {
3171                 *_home = home;
3172                 home = NULL;
3173         }
3174
3175         return 0;
3176 }
3177
3178 /*
3179  * Return values:
3180  * < 0 : wait_for_terminate() failed to get the state of the
3181  *       container, the container was terminated by a signal, or
3182  *       failed for an unknown reason.  No change is made to the
3183  *       container argument.
3184  * > 0 : The program executed in the container terminated with an
3185  *       error.  The exit code of the program executed in the
3186  *       container is returned.  The container argument has been set
3187  *       to CONTAINER_TERMINATED.
3188  *   0 : The container is being rebooted, has been shut down or exited
3189  *       successfully.  The container argument has been set to either
3190  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3191  *
3192  * That is, success is indicated by a return value of zero, and an
3193  * error is indicated by a non-zero value.
3194  */
3195 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3196         siginfo_t status;
3197         int r;
3198
3199         r = wait_for_terminate(pid, &status);
3200         if (r < 0)
3201                 return log_warning_errno(r, "Failed to wait for container: %m");
3202
3203         switch (status.si_code) {
3204
3205         case CLD_EXITED:
3206                 if (status.si_status == 0) {
3207                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3208
3209                 } else
3210                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3211
3212                 *container = CONTAINER_TERMINATED;
3213                 return status.si_status;
3214
3215         case CLD_KILLED:
3216                 if (status.si_status == SIGINT) {
3217
3218                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3219                         *container = CONTAINER_TERMINATED;
3220                         return 0;
3221
3222                 } else if (status.si_status == SIGHUP) {
3223
3224                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3225                         *container = CONTAINER_REBOOTED;
3226                         return 0;
3227                 }
3228
3229                 /* CLD_KILLED fallthrough */
3230
3231         case CLD_DUMPED:
3232                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3233                 return -EIO;
3234
3235         default:
3236                 log_error("Container %s failed due to unknown reason.", arg_machine);
3237                 return -EIO;
3238         }
3239
3240         return r;
3241 }
3242
/* Signal handler that intentionally does nothing. main() installs it
 * for SIGCHLD so that the signal interrupts blocking calls rather than
 * being ignored. */
static void nop_handler(int sig) {
        (void) sig;
}
3244
3245 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3246         pid_t pid;
3247
3248         pid = PTR_TO_UINT32(userdata);
3249         if (pid > 0) {
3250                 if (kill(pid, SIGRTMIN+3) >= 0) {
3251                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3252                         sd_event_source_set_userdata(s, NULL);
3253                         return 0;
3254                 }
3255         }
3256
3257         sd_event_exit(sd_event_source_get_event(s), 0);
3258         return 0;
3259 }
3260
3261 static int determine_names(void) {
3262         int r;
3263
3264         if (!arg_image && !arg_directory) {
3265                 if (arg_machine) {
3266                         _cleanup_(image_unrefp) Image *i = NULL;
3267
3268                         r = image_find(arg_machine, &i);
3269                         if (r < 0)
3270                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3271                         else if (r == 0) {
3272                                 log_error("No image for machine '%s': %m", arg_machine);
3273                                 return -ENOENT;
3274                         }
3275
3276                         if (i->type == IMAGE_GPT)
3277                                 r = set_sanitized_path(&arg_image, i->path);
3278                         else
3279                                 r = set_sanitized_path(&arg_directory, i->path);
3280                         if (r < 0)
3281                                 return log_error_errno(r, "Invalid image directory: %m");
3282
3283                         arg_read_only = arg_read_only || i->read_only;
3284                 } else
3285                         arg_directory = get_current_dir_name();
3286
3287                 if (!arg_directory && !arg_machine) {
3288                         log_error("Failed to determine path, please use -D or -i.");
3289                         return -EINVAL;
3290                 }
3291         }
3292
3293         if (!arg_machine) {
3294                 if (arg_directory && path_equal(arg_directory, "/"))
3295                         arg_machine = gethostname_malloc();
3296                 else
3297                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3298
3299                 if (!arg_machine)
3300                         return log_oom();
3301
3302                 hostname_cleanup(arg_machine, false);
3303                 if (!machine_name_is_valid(arg_machine)) {
3304                         log_error("Failed to determine machine name automatically, please use -M.");
3305                         return -EINVAL;
3306                 }
3307
3308                 if (arg_ephemeral) {
3309                         char *b;
3310
3311                         /* Add a random suffix when this is an
3312                          * ephemeral machine, so that we can run many
3313                          * instances at once without manually having
3314                          * to specify -M each time. */
3315
3316                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3317                                 return log_oom();
3318
3319                         free(arg_machine);
3320                         arg_machine = b;
3321                 }
3322         }
3323
3324         return 0;
3325 }
3326
3327 int main(int argc, char *argv[]) {
3328
3329         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3330         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3331         _cleanup_close_ int master = -1, image_fd = -1;
3332         _cleanup_fdset_free_ FDSet *fds = NULL;
3333         int r, n_fd_passed, loop_nr = -1;
3334         char veth_name[IFNAMSIZ];
3335         bool secondary = false, remove_subvol = false;
3336         sigset_t mask, mask_chld;
3337         pid_t pid = 0;
3338         int ret = EXIT_SUCCESS;
3339         union in_addr_union exposed = {};
3340         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3341
3342         log_parse_environment();
3343         log_open();
3344
3345         r = parse_argv(argc, argv);
3346         if (r <= 0)
3347                 goto finish;
3348
3349         r = determine_names();
3350         if (r < 0)
3351                 goto finish;
3352
3353         if (geteuid() != 0) {
3354                 log_error("Need to be root.");
3355                 r = -EPERM;
3356                 goto finish;
3357         }
3358
3359         if (sd_booted() <= 0) {
3360                 log_error("Not running on a systemd system.");
3361                 r = -EINVAL;
3362                 goto finish;
3363         }
3364
3365         log_close();
3366         n_fd_passed = sd_listen_fds(false);
3367         if (n_fd_passed > 0) {
3368                 r = fdset_new_listen_fds(&fds, false);
3369                 if (r < 0) {
3370                         log_error_errno(r, "Failed to collect file descriptors: %m");
3371                         goto finish;
3372                 }
3373         }
3374         fdset_close_others(fds);
3375         log_open();
3376
3377         if (arg_directory) {
3378                 assert(!arg_image);
3379
3380                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3381                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3382                         r = -EINVAL;
3383                         goto finish;
3384                 }
3385
3386                 if (arg_ephemeral) {
3387                         _cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
3388                         char *np;
3389
3390                         /* If the specified path is a mount point we
3391                          * generate the new snapshot immediately
3392                          * inside it under a random name. However if
3393                          * the specified is not a mount point we
3394                          * create the new snapshot in the parent
3395                          * directory, just next to it. */
3396                         r = path_is_mount_point(arg_directory, false);
3397                         if (r < 0) {
3398                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3399                                 goto finish;
3400                         }
3401                         if (r > 0)
3402                                 r = tempfn_random_child(arg_directory, &np);
3403                         else
3404                                 r = tempfn_random(arg_directory, &np);
3405                         if (r < 0) {
3406                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3407                                 goto finish;
3408                         }
3409
3410                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3411                         if (r < 0) {
3412                                 log_error_errno(r, "Failed to lock %s: %m", np);
3413                                 goto finish;
3414                         }
3415
3416                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3417                         if (r < 0) {
3418                                 free(np);
3419                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3420                                 goto finish;
3421                         }
3422
3423                         free(arg_directory);
3424                         arg_directory = np;
3425
3426                         remove_subvol = true;
3427
3428                 } else {
3429                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3430                         if (r == -EBUSY) {
3431                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3432                                 goto finish;
3433                         }
3434                         if (r < 0) {
3435                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3436                                 return r;
3437                         }
3438
3439                         if (arg_template) {
3440                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3441                                 if (r == -EEXIST) {
3442                                         if (!arg_quiet)
3443                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3444                                 } else if (r < 0) {
3445                                         log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
3446                                         goto finish;
3447                                 } else {
3448                                         if (!arg_quiet)
3449                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3450                                 }
3451                         }
3452                 }
3453
3454                 if (arg_boot) {
3455                         if (path_is_os_tree(arg_directory) <= 0) {
3456                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3457                                 r = -EINVAL;
3458                                 goto finish;
3459                         }
3460                 } else {
3461                         const char *p;
3462
3463                         p = strappenda(arg_directory,
3464                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3465                         if (access(p, F_OK) < 0) {
3466                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3467                                 r = -EINVAL;
3468                                 goto finish;
3469                         }
3470                 }
3471
3472         } else {
3473                 char template[] = "/tmp/nspawn-root-XXXXXX";
3474
3475                 assert(arg_image);
3476                 assert(!arg_template);
3477
3478                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3479                 if (r == -EBUSY) {
3480                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3481                         goto finish;
3482                 }
3483                 if (r < 0) {
3484                         r = log_error_errno(r, "Failed to create image lock: %m");
3485                         goto finish;
3486                 }
3487
3488                 if (!mkdtemp(template)) {
3489                         log_error_errno(errno, "Failed to create temporary directory: %m");
3490                         r = -errno;
3491                         goto finish;
3492                 }
3493
3494                 arg_directory = strdup(template);
3495                 if (!arg_directory) {
3496                         r = log_oom();
3497                         goto finish;
3498                 }
3499
3500                 image_fd = setup_image(&device_path, &loop_nr);
3501                 if (image_fd < 0) {
3502                         r = image_fd;
3503                         goto finish;
3504                 }
3505
3506                 r = dissect_image(image_fd,
3507                                   &root_device, &root_device_rw,
3508                                   &home_device, &home_device_rw,
3509                                   &srv_device, &srv_device_rw,
3510                                   &secondary);
3511                 if (r < 0)
3512                         goto finish;
3513         }
3514
3515         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3516         if (master < 0) {
3517                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3518                 goto finish;
3519         }
3520
3521         r = ptsname_malloc(master, &console);
3522         if (r < 0) {
3523                 r = log_error_errno(r, "Failed to determine tty name: %m");
3524                 goto finish;
3525         }
3526
3527         if (!arg_quiet)
3528                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3529                          arg_machine, arg_image ?: arg_directory);
3530
3531         if (unlockpt(master) < 0) {
3532                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3533                 goto finish;
3534         }
3535
3536         assert_se(sigemptyset(&mask) == 0);
3537         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3538         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3539
3540         assert_se(sigemptyset(&mask_chld) == 0);
3541         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3542
3543         for (;;) {
3544                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3545                 ContainerStatus container_status;
3546                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3547                 struct sigaction sa = {
3548                         .sa_handler = nop_handler,
3549                         .sa_flags = SA_NOCLDSTOP,
3550                 };
3551
3552                 r = barrier_create(&barrier);
3553                 if (r < 0) {
3554                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3555                         goto finish;
3556                 }
3557
3558                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3559                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3560                         goto finish;
3561                 }
3562
3563                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3564                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3565                         goto finish;
3566                 }
3567
3568                 /* Child can be killed before execv(), so handle SIGCHLD
3569                  * in order to interrupt parent's blocking calls and
3570                  * give it a chance to call wait() and terminate. */
3571                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3572                 if (r < 0) {
3573                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3574                         goto finish;
3575                 }
3576
3577                 r = sigaction(SIGCHLD, &sa, NULL);
3578                 if (r < 0) {
3579                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3580                         goto finish;
3581                 }
3582
3583                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3584                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3585                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3586                 if (pid < 0) {
3587                         if (errno == EINVAL)
3588                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3589                         else
3590                                 r = log_error_errno(errno, "clone() failed: %m");
3591
3592                         goto finish;
3593                 }
3594
3595                 if (pid == 0) {
3596                         /* child */
3597                         _cleanup_free_ char *home = NULL;
3598                         unsigned n_env = 2;
3599                         const char *envp[] = {
3600                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3601                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3602                                 NULL, /* TERM */
3603                                 NULL, /* HOME */
3604                                 NULL, /* USER */
3605                                 NULL, /* LOGNAME */
3606                                 NULL, /* container_uuid */
3607                                 NULL, /* LISTEN_FDS */
3608                                 NULL, /* LISTEN_PID */
3609                                 NULL
3610                         };
3611                         char **env_use;
3612
3613                         barrier_set_role(&barrier, BARRIER_CHILD);
3614
3615                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3616                         if (envp[n_env])
3617                                 n_env ++;
3618
3619                         master = safe_close(master);
3620
3621                         close_nointr(STDIN_FILENO);
3622                         close_nointr(STDOUT_FILENO);
3623                         close_nointr(STDERR_FILENO);
3624
3625                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3626                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3627
3628                         reset_all_signal_handlers();
3629                         reset_signal_mask();
3630
3631                         r = open_terminal(console, O_RDWR);
3632                         if (r != STDIN_FILENO) {
3633                                 if (r >= 0) {
3634                                         safe_close(r);
3635                                         r = -EINVAL;
3636                                 }
3637
3638                                 log_error_errno(r, "Failed to open console: %m");
3639                                 _exit(EXIT_FAILURE);
3640                         }
3641
3642                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3643                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3644                                 log_error_errno(errno, "Failed to duplicate console: %m");
3645                                 _exit(EXIT_FAILURE);
3646                         }
3647
3648                         if (setsid() < 0) {
3649                                 log_error_errno(errno, "setsid() failed: %m");
3650                                 _exit(EXIT_FAILURE);
3651                         }
3652
3653                         if (reset_audit_loginuid() < 0)
3654                                 _exit(EXIT_FAILURE);
3655
3656                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3657                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3658                                 _exit(EXIT_FAILURE);
3659                         }
3660
3661                         /* Mark everything as slave, so that we still
3662                          * receive mounts from the real root, but don't
3663                          * propagate mounts to the real root. */
3664                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3665                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3666                                 _exit(EXIT_FAILURE);
3667                         }
3668
3669                         if (mount_devices(arg_directory,
3670                                           root_device, root_device_rw,
3671                                           home_device, home_device_rw,
3672                                           srv_device, srv_device_rw) < 0)
3673                                 _exit(EXIT_FAILURE);
3674
3675                         /* Turn directory into bind mount */
3676                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3677                                 log_error_errno(errno, "Failed to make bind mount: %m");
3678                                 _exit(EXIT_FAILURE);
3679                         }
3680
3681                         r = setup_volatile(arg_directory);
3682                         if (r < 0)
3683                                 _exit(EXIT_FAILURE);
3684
3685                         if (setup_volatile_state(arg_directory) < 0)
3686                                 _exit(EXIT_FAILURE);
3687
3688                         r = base_filesystem_create(arg_directory);
3689                         if (r < 0)
3690                                 _exit(EXIT_FAILURE);
3691
3692                         if (arg_read_only) {
3693                                 r = bind_remount_recursive(arg_directory, true);
3694                                 if (r < 0) {
3695                                         log_error_errno(r, "Failed to make tree read-only: %m");
3696                                         _exit(EXIT_FAILURE);
3697                                 }
3698                         }
3699
3700                         if (mount_all(arg_directory) < 0)
3701                                 _exit(EXIT_FAILURE);
3702
3703                         if (copy_devnodes(arg_directory) < 0)
3704                                 _exit(EXIT_FAILURE);
3705
3706                         if (setup_ptmx(arg_directory) < 0)
3707                                 _exit(EXIT_FAILURE);
3708
3709                         dev_setup(arg_directory);
3710
3711                         if (setup_propagate(arg_directory) < 0)
3712                                 _exit(EXIT_FAILURE);
3713
3714                         if (setup_seccomp() < 0)
3715                                 _exit(EXIT_FAILURE);
3716
3717                         if (setup_dev_console(arg_directory, console) < 0)
3718                                 _exit(EXIT_FAILURE);
3719
3720                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3721                                 _exit(EXIT_FAILURE);
3722                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3723
3724                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
3725                                 _exit(EXIT_FAILURE);
3726                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3727
3728                         /* Tell the parent that we are ready, and that
3729                          * it can cgroupify us to that we lack access
3730                          * to certain devices and resources. */
3731                         (void) barrier_place(&barrier);
3732
3733                         if (setup_boot_id(arg_directory) < 0)
3734                                 _exit(EXIT_FAILURE);
3735
3736                         if (setup_timezone(arg_directory) < 0)
3737                                 _exit(EXIT_FAILURE);
3738
3739                         if (setup_resolv_conf(arg_directory) < 0)
3740                                 _exit(EXIT_FAILURE);
3741
3742                         if (setup_journal(arg_directory) < 0)
3743                                 _exit(EXIT_FAILURE);
3744
3745                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3746                                 _exit(EXIT_FAILURE);
3747
3748                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3749                                 _exit(EXIT_FAILURE);
3750
3751                         if (mount_tmpfs(arg_directory) < 0)
3752                                 _exit(EXIT_FAILURE);
3753
3754                         /* Wait until we are cgroup-ified, so that we
3755                          * can mount the right cgroup path writable */
3756                         (void) barrier_sync_next(&barrier);
3757
3758                         if (mount_cgroup(arg_directory) < 0)
3759                                 _exit(EXIT_FAILURE);
3760
3761                         if (chdir(arg_directory) < 0) {
3762                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3763                                 _exit(EXIT_FAILURE);
3764                         }
3765
3766                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3767                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3768                                 _exit(EXIT_FAILURE);
3769                         }
3770
3771                         if (chroot(".") < 0) {
3772                                 log_error_errno(errno, "chroot() failed: %m");
3773                                 _exit(EXIT_FAILURE);
3774                         }
3775
3776                         if (chdir("/") < 0) {
3777                                 log_error_errno(errno, "chdir() failed: %m");
3778                                 _exit(EXIT_FAILURE);
3779                         }
3780
3781                         umask(0022);
3782
3783                         if (arg_private_network)
3784                                 loopback_setup();
3785
3786                         if (drop_capabilities() < 0) {
3787                                 log_error_errno(errno, "drop_capabilities() failed: %m");
3788                                 _exit(EXIT_FAILURE);
3789                         }
3790
3791                         r = change_uid_gid(&home);
3792                         if (r < 0)
3793                                 _exit(EXIT_FAILURE);
3794
3795                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3796                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3797                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3798                                 log_oom();
3799                                 _exit(EXIT_FAILURE);
3800                         }
3801
3802                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3803                                 char as_uuid[37];
3804
3805                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3806                                         log_oom();
3807                                         _exit(EXIT_FAILURE);
3808                                 }
3809                         }
3810
3811                         if (fdset_size(fds) > 0) {
3812                                 r = fdset_cloexec(fds, false);
3813                                 if (r < 0) {
3814                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3815                                         _exit(EXIT_FAILURE);
3816                                 }
3817
3818                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3819                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3820                                         log_oom();
3821                                         _exit(EXIT_FAILURE);
3822                                 }
3823                         }
3824
3825                         setup_hostname();
3826
3827                         if (arg_personality != 0xffffffffLU) {
3828                                 if (personality(arg_personality) < 0) {
3829                                         log_error_errno(errno, "personality() failed: %m");
3830                                         _exit(EXIT_FAILURE);
3831                                 }
3832                         } else if (secondary) {
3833                                 if (personality(PER_LINUX32) < 0) {
3834                                         log_error_errno(errno, "personality() failed: %m");
3835                                         _exit(EXIT_FAILURE);
3836                                 }
3837                         }
3838
3839 #ifdef HAVE_SELINUX
3840                         if (arg_selinux_context)
3841                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3842                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3843                                         _exit(EXIT_FAILURE);
3844                                 }
3845 #endif
3846
3847                         if (!strv_isempty(arg_setenv)) {
3848                                 char **n;
3849
3850                                 n = strv_env_merge(2, envp, arg_setenv);
3851                                 if (!n) {
3852                                         log_oom();
3853                                         _exit(EXIT_FAILURE);
3854                                 }
3855
3856                                 env_use = n;
3857                         } else
3858                                 env_use = (char**) envp;
3859
3860                         /* Wait until the parent is ready with the setup, too... */
3861                         if (!barrier_place_and_sync(&barrier))
3862                                 _exit(EXIT_FAILURE);
3863
3864                         if (arg_boot) {
3865                                 char **a;
3866                                 size_t l;
3867
3868                                 /* Automatically search for the init system */
3869
3870                                 l = 1 + argc - optind;
3871                                 a = newa(char*, l + 1);
3872                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3873
3874                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3875                                 execve(a[0], a, env_use);
3876
3877                                 a[0] = (char*) "/lib/systemd/systemd";
3878                                 execve(a[0], a, env_use);
3879
3880                                 a[0] = (char*) "/sbin/init";
3881                                 execve(a[0], a, env_use);
3882                         } else if (argc > optind)
3883                                 execvpe(argv[optind], argv + optind, env_use);
3884                         else {
3885                                 chdir(home ? home : "/root");
3886                                 execle("/bin/bash", "-bash", NULL, env_use);
3887                                 execle("/bin/sh", "-sh", NULL, env_use);
3888                         }
3889
3890                         log_error_errno(errno, "execv() failed: %m");
3891                         _exit(EXIT_FAILURE);
3892                 }
3893
3894                 barrier_set_role(&barrier, BARRIER_PARENT);
3895                 fdset_free(fds);
3896                 fds = NULL;
3897
3898                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3899                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3900
3901                 /* Wait for the most basic Child-setup to be done,
3902                  * before we add hardware to it, and place it in a
3903                  * cgroup. */
3904                 if (barrier_sync_next(&barrier)) {
3905                         int ifi = 0;
3906
3907                         r = move_network_interfaces(pid);
3908                         if (r < 0)
3909                                 goto finish;
3910
3911                         r = setup_veth(pid, veth_name, &ifi);
3912                         if (r < 0)
3913                                 goto finish;
3914
3915                         r = setup_bridge(veth_name, &ifi);
3916                         if (r < 0)
3917                                 goto finish;
3918
3919                         r = setup_macvlan(pid);
3920                         if (r < 0)
3921                                 goto finish;
3922
3923                         r = register_machine(pid, ifi);
3924                         if (r < 0)
3925                                 goto finish;
3926
3927                         /* Block SIGCHLD here, before notifying child.
3928                          * process_pty() will handle it with the other signals. */
3929                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3930                         if (r < 0)
3931                                 goto finish;
3932
3933                         /* Reset signal to default */
3934                         r = default_signals(SIGCHLD, -1);
3935                         if (r < 0)
3936                                 goto finish;
3937
3938                         /* Notify the child that the parent is ready with all
3939                          * its setup, and that the child can now hand over
3940                          * control to the code to run inside the container. */
3941                         (void) barrier_place(&barrier);
3942
3943                         /* And wait until the child is completely ready. */
3944                         if (barrier_place_and_sync(&barrier)) {
3945                                 _cleanup_event_unref_ sd_event *event = NULL;
3946                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3947                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
3948                                 char last_char = 0;
3949
3950                                 sd_notifyf(false,
3951                                            "READY=1\n"
3952                                            "STATUS=Container running.\n"
3953                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
3954
3955                                 r = sd_event_new(&event);
3956                                 if (r < 0) {
3957                                         log_error_errno(r, "Failed to get default event source: %m");
3958                                         goto finish;
3959                                 }
3960
3961                                 if (arg_boot) {
3962                                         /* Try to kill the init system on SIGINT or SIGTERM */
3963                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3964                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3965                                 } else {
3966                                         /* Immediately exit */
3967                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3968                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3969                                 }
3970
3971                                 /* simply exit on sigchld */
3972                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3973
3974                                 if (arg_expose_ports) {
3975                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
3976                                         if (r < 0)
3977                                                 goto finish;
3978
3979                                         (void) expose_ports(rtnl, &exposed);
3980                                 }
3981
3982                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3983
3984                                 r = pty_forward_new(event, master, true, &forward);
3985                                 if (r < 0) {
3986                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
3987                                         goto finish;
3988                                 }
3989
3990                                 r = sd_event_loop(event);
3991                                 if (r < 0) {
3992                                         log_error_errno(r, "Failed to run event loop: %m");
3993                                         goto finish;
3994                                 }
3995
3996                                 pty_forward_get_last_char(forward, &last_char);
3997
3998                                 forward = pty_forward_free(forward);
3999
4000                                 if (!arg_quiet && last_char != '\n')
4001                                         putc('\n', stdout);
4002
4003                                 /* Kill if it is not dead yet anyway */
4004                                 terminate_machine(pid);
4005                         }
4006                 }
4007
4008                 /* Normally redundant, but better safe than sorry */
4009                 kill(pid, SIGKILL);
4010
4011                 r = wait_for_container(pid, &container_status);
4012                 pid = 0;
4013
4014                 if (r < 0)
4015                         /* We failed to wait for the container, or the
4016                          * container exited abnormally */
4017                         goto finish;
4018                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4019                         /* The container exited with a non-zero
4020                          * status, or with zero status and no reboot
4021                          * was requested. */
4022                         ret = r;
4023                         break;
4024                 }
4025
4026                 /* CONTAINER_REBOOTED, loop again */
4027
4028                 if (arg_keep_unit) {
4029                         /* Special handling if we are running as a
4030                          * service: instead of simply restarting the
4031                          * machine we want to restart the entire
4032                          * service, so let's inform systemd about this
4033                          * with the special exit code 133. The service
4034                          * file uses RestartForceExitStatus=133 so
4035                          * that this results in a full nspawn
4036                          * restart. This is necessary since we might
4037                          * have cgroup parameters set we want to have
4038                          * flushed out. */
4039                         ret = 133;
4040                         r = 0;
4041                         break;
4042                 }
4043
4044                 flush_ports(&exposed);
4045         }
4046
4047 finish:
4048         sd_notify(false,
4049                   "STOPPING=1\n"
4050                   "STATUS=Terminating...");
4051
4052         loop_remove(loop_nr, &image_fd);
4053
4054         if (pid > 0)
4055                 kill(pid, SIGKILL);
4056
4057         if (remove_subvol && arg_directory) {
4058                 int k;
4059
4060                 k = btrfs_subvol_remove(arg_directory);
4061                 if (k < 0)
4062                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4063         }
4064
4065         if (arg_machine) {
4066                 const char *p;
4067
4068                 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
4069                 (void) rm_rf(p, false, true, false);
4070         }
4071
4072         free(arg_directory);
4073         free(arg_template);
4074         free(arg_image);
4075         free(arg_machine);
4076         free(arg_user);
4077         strv_free(arg_setenv);
4078         strv_free(arg_network_interfaces);
4079         strv_free(arg_network_macvlan);
4080         strv_free(arg_bind);
4081         strv_free(arg_bind_ro);
4082         strv_free(arg_tmpfs);
4083
4084         flush_ports(&exposed);
4085
4086         while (arg_expose_ports) {
4087                 ExposePort *p = arg_expose_ports;
4088                 LIST_REMOVE(ports, arg_expose_ports, p);
4089                 free(p);
4090         }
4091
4092         return r < 0 ? EXIT_FAILURE : ret;
4093 }