chiark / gitweb /
spawn: downgrade loopback detach errors to debug
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46
47 #ifdef HAVE_SELINUX
48 #include <selinux/selinux.h>
49 #endif
50
51 #ifdef HAVE_SECCOMP
52 #include <seccomp.h>
53 #endif
54
55 #ifdef HAVE_BLKID
56 #include <blkid/blkid.h>
57 #endif
58
59 #include "sd-daemon.h"
60 #include "sd-bus.h"
61 #include "sd-id128.h"
62 #include "sd-rtnl.h"
63 #include "log.h"
64 #include "util.h"
65 #include "mkdir.h"
66 #include "macro.h"
67 #include "audit.h"
68 #include "missing.h"
69 #include "cgroup-util.h"
70 #include "strv.h"
71 #include "path-util.h"
72 #include "loopback-setup.h"
73 #include "dev-setup.h"
74 #include "fdset.h"
75 #include "build.h"
76 #include "fileio.h"
77 #include "bus-util.h"
78 #include "bus-error.h"
79 #include "ptyfwd.h"
80 #include "bus-kernel.h"
81 #include "env-util.h"
82 #include "def.h"
83 #include "rtnl-util.h"
84 #include "udev-util.h"
85 #include "blkid-util.h"
86 #include "gpt.h"
87 #include "siphash24.h"
88 #include "copy.h"
89 #include "base-filesystem.h"
90 #include "barrier.h"
91 #include "event-util.h"
92 #include "capability.h"
93 #include "cap-list.h"
94 #include "btrfs-util.h"
95 #include "machine-image.h"
96 #include "list.h"
97 #include "in-addr-util.h"
98 #include "fw-util.h"
99 #include "local-addresses.h"
100
101 #ifdef HAVE_SECCOMP
102 #include "seccomp-util.h"
103 #endif
104
105 typedef struct ExposePort {
106         int protocol;
107         uint16_t host_port;
108         uint16_t container_port;
109         LIST_FIELDS(struct ExposePort, ports);
110 } ExposePort;
111
/* Outcome of a container run. NOTE(review): judging by the names, this
 * distinguishes a final exit from a reboot request; the consumers are outside
 * this part of the file, so confirm there. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
116
/* Journal linking mode selected with --link-journal= (or -j).
 * The "try-" variants of the option map onto LINK_GUEST/LINK_HOST and are
 * distinguished by the separate arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* --link-journal=no */
        LINK_AUTO  = 1,   /* --link-journal=auto (the default) */
        LINK_HOST  = 2,   /* --link-journal=host / try-host */
        LINK_GUEST = 3    /* --link-journal=guest / try-guest / -j */
} LinkJournal;
123
/* Mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,   /* option absent, or given a false boolean */
        VOLATILE_YES   = 1,   /* option given without argument, or a true boolean */
        VOLATILE_STATE = 2,   /* --volatile=state */
} Volatile;
129
130 static char *arg_directory = NULL;
131 static char *arg_template = NULL;
132 static char *arg_user = NULL;
133 static sd_id128_t arg_uuid = {};
134 static char *arg_machine = NULL;
135 static const char *arg_selinux_context = NULL;
136 static const char *arg_selinux_apifs_context = NULL;
137 static const char *arg_slice = NULL;
138 static bool arg_private_network = false;
139 static bool arg_read_only = false;
140 static bool arg_boot = false;
141 static bool arg_ephemeral = false;
142 static LinkJournal arg_link_journal = LINK_AUTO;
143 static bool arg_link_journal_try = false;
144 static uint64_t arg_retain =
145         (1ULL << CAP_CHOWN) |
146         (1ULL << CAP_DAC_OVERRIDE) |
147         (1ULL << CAP_DAC_READ_SEARCH) |
148         (1ULL << CAP_FOWNER) |
149         (1ULL << CAP_FSETID) |
150         (1ULL << CAP_IPC_OWNER) |
151         (1ULL << CAP_KILL) |
152         (1ULL << CAP_LEASE) |
153         (1ULL << CAP_LINUX_IMMUTABLE) |
154         (1ULL << CAP_NET_BIND_SERVICE) |
155         (1ULL << CAP_NET_BROADCAST) |
156         (1ULL << CAP_NET_RAW) |
157         (1ULL << CAP_SETGID) |
158         (1ULL << CAP_SETFCAP) |
159         (1ULL << CAP_SETPCAP) |
160         (1ULL << CAP_SETUID) |
161         (1ULL << CAP_SYS_ADMIN) |
162         (1ULL << CAP_SYS_CHROOT) |
163         (1ULL << CAP_SYS_NICE) |
164         (1ULL << CAP_SYS_PTRACE) |
165         (1ULL << CAP_SYS_TTY_CONFIG) |
166         (1ULL << CAP_SYS_RESOURCE) |
167         (1ULL << CAP_SYS_BOOT) |
168         (1ULL << CAP_AUDIT_WRITE) |
169         (1ULL << CAP_AUDIT_CONTROL) |
170         (1ULL << CAP_MKNOD);
171 static char **arg_bind = NULL;
172 static char **arg_bind_ro = NULL;
173 static char **arg_tmpfs = NULL;
174 static char **arg_setenv = NULL;
175 static bool arg_quiet = false;
176 static bool arg_share_system = false;
177 static bool arg_register = true;
178 static bool arg_keep_unit = false;
179 static char **arg_network_interfaces = NULL;
180 static char **arg_network_macvlan = NULL;
181 static bool arg_network_veth = false;
182 static const char *arg_network_bridge = NULL;
183 static unsigned long arg_personality = 0xffffffffLU;
184 static char *arg_image = NULL;
185 static Volatile arg_volatile = VOLATILE_NO;
186 static ExposePort *arg_expose_ports = NULL;
187
/* Print the command line usage summary to stdout.
 *
 * The program name in the first line is taken from
 * program_invocation_short_name. Every long option accepted by parse_argv()
 * should have an entry here; --personality= was previously missing. */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "     --template=PATH        Initialize root directory from template directory,\n"
               "                            if missing\n"
               "  -x --ephemeral            Run container with snapshot of root directory, and\n"
               "                            remove it after exit\n"
               "  -i --image=PATH           File system device or disk image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "  -n --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
               "                            Expose a container IP port on the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
               "                            try-guest, try-host\n"
               "  -j                        Equivalent to --link-journal=try-guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --personality=ARCH     Pick personality for this container\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n"
               , program_invocation_short_name);
}
245
/* Replace *b with a cleaned-up, absolute form of 'path'.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * only because the path does not exist (ENOENT), it is instead made absolute
 * relative to the current working directory, so that not-yet-existing
 * locations can still be named. The previous value of *b is freed.
 *
 * Returns 0 on success, -ENOMEM if the fallback allocation fails, or the
 * negated errno from canonicalize_file_name() for any other failure. On
 * failure *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Anything other than "does not exist" is a hard error. */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
266
267 static int parse_argv(int argc, char *argv[]) {
268
269         enum {
270                 ARG_VERSION = 0x100,
271                 ARG_PRIVATE_NETWORK,
272                 ARG_UUID,
273                 ARG_READ_ONLY,
274                 ARG_CAPABILITY,
275                 ARG_DROP_CAPABILITY,
276                 ARG_LINK_JOURNAL,
277                 ARG_BIND,
278                 ARG_BIND_RO,
279                 ARG_TMPFS,
280                 ARG_SETENV,
281                 ARG_SHARE_SYSTEM,
282                 ARG_REGISTER,
283                 ARG_KEEP_UNIT,
284                 ARG_NETWORK_INTERFACE,
285                 ARG_NETWORK_MACVLAN,
286                 ARG_NETWORK_BRIDGE,
287                 ARG_PERSONALITY,
288                 ARG_VOLATILE,
289                 ARG_TEMPLATE,
290         };
291
292         static const struct option options[] = {
293                 { "help",                  no_argument,       NULL, 'h'                   },
294                 { "version",               no_argument,       NULL, ARG_VERSION           },
295                 { "directory",             required_argument, NULL, 'D'                   },
296                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
297                 { "ephemeral",             no_argument,       NULL, 'x'                   },
298                 { "user",                  required_argument, NULL, 'u'                   },
299                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
300                 { "boot",                  no_argument,       NULL, 'b'                   },
301                 { "uuid",                  required_argument, NULL, ARG_UUID              },
302                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
303                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
304                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
305                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
306                 { "bind",                  required_argument, NULL, ARG_BIND              },
307                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
308                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
309                 { "machine",               required_argument, NULL, 'M'                   },
310                 { "slice",                 required_argument, NULL, 'S'                   },
311                 { "setenv",                required_argument, NULL, ARG_SETENV            },
312                 { "selinux-context",       required_argument, NULL, 'Z'                   },
313                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
314                 { "quiet",                 no_argument,       NULL, 'q'                   },
315                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
316                 { "register",              required_argument, NULL, ARG_REGISTER          },
317                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
318                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
319                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
320                 { "network-veth",          no_argument,       NULL, 'n'                   },
321                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
322                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
323                 { "image",                 required_argument, NULL, 'i'                   },
324                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
325                 { "port",                  required_argument, NULL, 'p'                   },
326                 {}
327         };
328
329         int c, r;
330         uint64_t plus = 0, minus = 0;
331
332         assert(argc >= 0);
333         assert(argv);
334
335         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
336
337                 switch (c) {
338
339                 case 'h':
340                         help();
341                         return 0;
342
343                 case ARG_VERSION:
344                         puts(PACKAGE_STRING);
345                         puts(SYSTEMD_FEATURES);
346                         return 0;
347
348                 case 'D':
349                         r = set_sanitized_path(&arg_directory, optarg);
350                         if (r < 0)
351                                 return log_error_errno(r, "Invalid root directory: %m");
352
353                         break;
354
355                 case ARG_TEMPLATE:
356                         r = set_sanitized_path(&arg_template, optarg);
357                         if (r < 0)
358                                 return log_error_errno(r, "Invalid template directory: %m");
359
360                         break;
361
362                 case 'i':
363                         r = set_sanitized_path(&arg_image, optarg);
364                         if (r < 0)
365                                 return log_error_errno(r, "Invalid image path: %m");
366
367                         break;
368
369                 case 'x':
370                         arg_ephemeral = true;
371                         break;
372
373                 case 'u':
374                         free(arg_user);
375                         arg_user = strdup(optarg);
376                         if (!arg_user)
377                                 return log_oom();
378
379                         break;
380
381                 case ARG_NETWORK_BRIDGE:
382                         arg_network_bridge = optarg;
383
384                         /* fall through */
385
386                 case 'n':
387                         arg_network_veth = true;
388                         arg_private_network = true;
389                         break;
390
391                 case ARG_NETWORK_INTERFACE:
392                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
393                                 return log_oom();
394
395                         arg_private_network = true;
396                         break;
397
398                 case ARG_NETWORK_MACVLAN:
399                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
400                                 return log_oom();
401
402                         /* fall through */
403
404                 case ARG_PRIVATE_NETWORK:
405                         arg_private_network = true;
406                         break;
407
408                 case 'b':
409                         arg_boot = true;
410                         break;
411
412                 case ARG_UUID:
413                         r = sd_id128_from_string(optarg, &arg_uuid);
414                         if (r < 0) {
415                                 log_error("Invalid UUID: %s", optarg);
416                                 return r;
417                         }
418                         break;
419
420                 case 'S':
421                         arg_slice = optarg;
422                         break;
423
424                 case 'M':
425                         if (isempty(optarg)) {
426                                 free(arg_machine);
427                                 arg_machine = NULL;
428                         } else {
429                                 if (!machine_name_is_valid(optarg)) {
430                                         log_error("Invalid machine name: %s", optarg);
431                                         return -EINVAL;
432                                 }
433
434                                 r = free_and_strdup(&arg_machine, optarg);
435                                 if (r < 0)
436                                         return log_oom();
437
438                                 break;
439                         }
440
441                 case 'Z':
442                         arg_selinux_context = optarg;
443                         break;
444
445                 case 'L':
446                         arg_selinux_apifs_context = optarg;
447                         break;
448
449                 case ARG_READ_ONLY:
450                         arg_read_only = true;
451                         break;
452
453                 case ARG_CAPABILITY:
454                 case ARG_DROP_CAPABILITY: {
455                         const char *state, *word;
456                         size_t length;
457
458                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
459                                 _cleanup_free_ char *t;
460
461                                 t = strndup(word, length);
462                                 if (!t)
463                                         return log_oom();
464
465                                 if (streq(t, "all")) {
466                                         if (c == ARG_CAPABILITY)
467                                                 plus = (uint64_t) -1;
468                                         else
469                                                 minus = (uint64_t) -1;
470                                 } else {
471                                         int cap;
472
473                                         cap = capability_from_name(t);
474                                         if (cap < 0) {
475                                                 log_error("Failed to parse capability %s.", t);
476                                                 return -EINVAL;
477                                         }
478
479                                         if (c == ARG_CAPABILITY)
480                                                 plus |= 1ULL << (uint64_t) cap;
481                                         else
482                                                 minus |= 1ULL << (uint64_t) cap;
483                                 }
484                         }
485
486                         break;
487                 }
488
489                 case 'j':
490                         arg_link_journal = LINK_GUEST;
491                         arg_link_journal_try = true;
492                         break;
493
494                 case ARG_LINK_JOURNAL:
495                         if (streq(optarg, "auto")) {
496                                 arg_link_journal = LINK_AUTO;
497                                 arg_link_journal_try = false;
498                         } else if (streq(optarg, "no")) {
499                                 arg_link_journal = LINK_NO;
500                                 arg_link_journal_try = false;
501                         } else if (streq(optarg, "guest")) {
502                                 arg_link_journal = LINK_GUEST;
503                                 arg_link_journal_try = false;
504                         } else if (streq(optarg, "host")) {
505                                 arg_link_journal = LINK_HOST;
506                                 arg_link_journal_try = false;
507                         } else if (streq(optarg, "try-guest")) {
508                                 arg_link_journal = LINK_GUEST;
509                                 arg_link_journal_try = true;
510                         } else if (streq(optarg, "try-host")) {
511                                 arg_link_journal = LINK_HOST;
512                                 arg_link_journal_try = true;
513                         } else {
514                                 log_error("Failed to parse link journal mode %s", optarg);
515                                 return -EINVAL;
516                         }
517
518                         break;
519
520                 case ARG_BIND:
521                 case ARG_BIND_RO: {
522                         _cleanup_free_ char *a = NULL, *b = NULL;
523                         char *e;
524                         char ***x;
525
526                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
527
528                         e = strchr(optarg, ':');
529                         if (e) {
530                                 a = strndup(optarg, e - optarg);
531                                 b = strdup(e + 1);
532                         } else {
533                                 a = strdup(optarg);
534                                 b = strdup(optarg);
535                         }
536
537                         if (!a || !b)
538                                 return log_oom();
539
540                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
541                                 log_error("Invalid bind mount specification: %s", optarg);
542                                 return -EINVAL;
543                         }
544
545                         r = strv_extend(x, a);
546                         if (r < 0)
547                                 return log_oom();
548
549                         r = strv_extend(x, b);
550                         if (r < 0)
551                                 return log_oom();
552
553                         break;
554                 }
555
556                 case ARG_TMPFS: {
557                         _cleanup_free_ char *a = NULL, *b = NULL;
558                         char *e;
559
560                         e = strchr(optarg, ':');
561                         if (e) {
562                                 a = strndup(optarg, e - optarg);
563                                 b = strdup(e + 1);
564                         } else {
565                                 a = strdup(optarg);
566                                 b = strdup("mode=0755");
567                         }
568
569                         if (!a || !b)
570                                 return log_oom();
571
572                         if (!path_is_absolute(a)) {
573                                 log_error("Invalid tmpfs specification: %s", optarg);
574                                 return -EINVAL;
575                         }
576
577                         r = strv_push(&arg_tmpfs, a);
578                         if (r < 0)
579                                 return log_oom();
580
581                         a = NULL;
582
583                         r = strv_push(&arg_tmpfs, b);
584                         if (r < 0)
585                                 return log_oom();
586
587                         b = NULL;
588
589                         break;
590                 }
591
592                 case ARG_SETENV: {
593                         char **n;
594
595                         if (!env_assignment_is_valid(optarg)) {
596                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
597                                 return -EINVAL;
598                         }
599
600                         n = strv_env_set(arg_setenv, optarg);
601                         if (!n)
602                                 return log_oom();
603
604                         strv_free(arg_setenv);
605                         arg_setenv = n;
606                         break;
607                 }
608
609                 case 'q':
610                         arg_quiet = true;
611                         break;
612
613                 case ARG_SHARE_SYSTEM:
614                         arg_share_system = true;
615                         break;
616
617                 case ARG_REGISTER:
618                         r = parse_boolean(optarg);
619                         if (r < 0) {
620                                 log_error("Failed to parse --register= argument: %s", optarg);
621                                 return r;
622                         }
623
624                         arg_register = r;
625                         break;
626
627                 case ARG_KEEP_UNIT:
628                         arg_keep_unit = true;
629                         break;
630
631                 case ARG_PERSONALITY:
632
633                         arg_personality = personality_from_string(optarg);
634                         if (arg_personality == 0xffffffffLU) {
635                                 log_error("Unknown or unsupported personality '%s'.", optarg);
636                                 return -EINVAL;
637                         }
638
639                         break;
640
641                 case ARG_VOLATILE:
642
643                         if (!optarg)
644                                 arg_volatile = VOLATILE_YES;
645                         else {
646                                 r = parse_boolean(optarg);
647                                 if (r < 0) {
648                                         if (streq(optarg, "state"))
649                                                 arg_volatile = VOLATILE_STATE;
650                                         else {
651                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
652                                                 return r;
653                                         }
654                                 } else
655                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
656                         }
657
658                         break;
659
660                 case 'p': {
661                         const char *split, *e;
662                         uint16_t container_port, host_port;
663                         int protocol;
664                         ExposePort *p;
665
666                         if ((e = startswith(optarg, "tcp:")))
667                                 protocol = IPPROTO_TCP;
668                         else if ((e = startswith(optarg, "udp:")))
669                                 protocol = IPPROTO_UDP;
670                         else {
671                                 e = optarg;
672                                 protocol = IPPROTO_TCP;
673                         }
674
675                         split = strchr(e, ':');
676                         if (split) {
677                                 char v[split - e + 1];
678
679                                 memcpy(v, e, split - e);
680                                 v[split - e] = 0;
681
682                                 r = safe_atou16(v, &host_port);
683                                 if (r < 0 || host_port <= 0) {
684                                         log_error("Failed to parse host port: %s", optarg);
685                                         return -EINVAL;
686                                 }
687
688                                 r = safe_atou16(split + 1, &container_port);
689                         } else {
690                                 r = safe_atou16(e, &container_port);
691                                 host_port = container_port;
692                         }
693
694                         if (r < 0 || container_port <= 0) {
695                                 log_error("Failed to parse host port: %s", optarg);
696                                 return -EINVAL;
697                         }
698
699                         LIST_FOREACH(ports, p, arg_expose_ports) {
700                                 if (p->protocol == protocol && p->host_port == host_port) {
701                                         log_error("Duplicate port specification: %s", optarg);
702                                         return -EINVAL;
703                                 }
704                         }
705
706                         p = new(ExposePort, 1);
707                         if (!p)
708                                 return log_oom();
709
710                         p->protocol = protocol;
711                         p->host_port = host_port;
712                         p->container_port = container_port;
713
714                         LIST_PREPEND(ports, arg_expose_ports, p);
715
716                         break;
717                 }
718
719                 case '?':
720                         return -EINVAL;
721
722                 default:
723                         assert_not_reached("Unhandled option");
724                 }
725
726         if (arg_share_system)
727                 arg_register = false;
728
729         if (arg_boot && arg_share_system) {
730                 log_error("--boot and --share-system may not be combined.");
731                 return -EINVAL;
732         }
733
734         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
735                 log_error("--keep-unit may not be used when invoked from a user session.");
736                 return -EINVAL;
737         }
738
739         if (arg_directory && arg_image) {
740                 log_error("--directory= and --image= may not be combined.");
741                 return -EINVAL;
742         }
743
744         if (arg_template && arg_image) {
745                 log_error("--template= and --image= may not be combined.");
746                 return -EINVAL;
747         }
748
749         if (arg_template && !(arg_directory || arg_machine)) {
750                 log_error("--template= needs --directory= or --machine=.");
751                 return -EINVAL;
752         }
753
754         if (arg_ephemeral && arg_template) {
755                 log_error("--ephemeral and --template= may not be combined.");
756                 return -EINVAL;
757         }
758
759         if (arg_ephemeral && arg_image) {
760                 log_error("--ephemeral and --image= may not be combined.");
761                 return -EINVAL;
762         }
763
764         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
765                 log_error("--ephemeral and --link-journal= may not be combined.");
766                 return -EINVAL;
767         }
768
769         if (arg_volatile != VOLATILE_NO && arg_read_only) {
770                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
771                 return -EINVAL;
772         }
773
774         if (arg_expose_ports && !arg_private_network) {
775                 log_error("Cannot use --port= without private networking.");
776                 return -EINVAL;
777         }
778
779         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
780
781         return 1;
782 }
783
784 static int mount_all(const char *dest) {
785
786         typedef struct MountPoint {
787                 const char *what;
788                 const char *where;
789                 const char *type;
790                 const char *options;
791                 unsigned long flags;
792                 bool fatal;
793         } MountPoint;
794
795         static const MountPoint mount_table[] = {
796                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
797                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
798                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
799                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
800                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
801                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
802                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
803                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
804 #ifdef HAVE_SELINUX
805                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
806                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
807 #endif
808         };
809
810         unsigned k;
811         int r = 0;
812
813         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
814                 _cleanup_free_ char *where = NULL;
815 #ifdef HAVE_SELINUX
816                 _cleanup_free_ char *options = NULL;
817 #endif
818                 const char *o;
819                 int t;
820
821                 where = strjoin(dest, "/", mount_table[k].where, NULL);
822                 if (!where)
823                         return log_oom();
824
825                 t = path_is_mount_point(where, true);
826                 if (t < 0) {
827                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
828
829                         if (r == 0)
830                                 r = t;
831
832                         continue;
833                 }
834
835                 /* Skip this entry if it is not a remount. */
836                 if (mount_table[k].what && t > 0)
837                         continue;
838
839                 t = mkdir_p(where, 0755);
840                 if (t < 0) {
841                         if (mount_table[k].fatal) {
842                                log_error_errno(t, "Failed to create directory %s: %m", where);
843
844                                 if (r == 0)
845                                         r = t;
846                         } else
847                                log_warning_errno(t, "Failed to create directory %s: %m", where);
848
849                         continue;
850                 }
851
852 #ifdef HAVE_SELINUX
853                 if (arg_selinux_apifs_context &&
854                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
855                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
856                         if (!options)
857                                 return log_oom();
858
859                         o = options;
860                 } else
861 #endif
862                         o = mount_table[k].options;
863
864
865                 if (mount(mount_table[k].what,
866                           where,
867                           mount_table[k].type,
868                           mount_table[k].flags,
869                           o) < 0) {
870
871                         if (mount_table[k].fatal) {
872                                 log_error_errno(errno, "mount(%s) failed: %m", where);
873
874                                 if (r == 0)
875                                         r = -errno;
876                         } else
877                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
878                 }
879         }
880
881         return r;
882 }
883
884 static int mount_binds(const char *dest, char **l, bool ro) {
885         char **x, **y;
886
887         STRV_FOREACH_PAIR(x, y, l) {
888                 _cleanup_free_ char *where = NULL;
889                 struct stat source_st, dest_st;
890                 int r;
891
892                 if (stat(*x, &source_st) < 0)
893                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
894
895                 where = strappend(dest, *y);
896                 if (!where)
897                         return log_oom();
898
899                 r = stat(where, &dest_st);
900                 if (r == 0) {
901                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
902                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
903                                 return -EINVAL;
904                         }
905                 } else if (errno == ENOENT) {
906                         r = mkdir_parents_label(where, 0755);
907                         if (r < 0)
908                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
909                 } else {
910                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
911                         return -errno;
912                 }
913
914                 /* Create the mount point, but be conservative -- refuse to create block
915                  * and char devices. */
916                 if (S_ISDIR(source_st.st_mode)) {
917                         r = mkdir_label(where, 0755);
918                         if (r < 0 && errno != EEXIST)
919                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
920                 } else if (S_ISFIFO(source_st.st_mode)) {
921                         r = mkfifo(where, 0644);
922                         if (r < 0 && errno != EEXIST)
923                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
924                 } else if (S_ISSOCK(source_st.st_mode)) {
925                         r = mknod(where, 0644 | S_IFSOCK, 0);
926                         if (r < 0 && errno != EEXIST)
927                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
928                 } else if (S_ISREG(source_st.st_mode)) {
929                         r = touch(where);
930                         if (r < 0)
931                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
932                 } else {
933                         log_error("Refusing to create mountpoint for file: %s", *x);
934                         return -ENOTSUP;
935                 }
936
937                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
938                         return log_error_errno(errno, "mount(%s) failed: %m", where);
939
940                 if (ro) {
941                         r = bind_remount_recursive(where, true);
942                         if (r < 0)
943                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
944                 }
945         }
946
947         return 0;
948 }
949
/* Mounts the cgroup hierarchy selected by the mount option string
 * "controller" to <dest>/sys/fs/cgroup/<hierarchy>, read-only if
 * requested.
 *
 * Returns 1 if the hierarchy was mounted, 0 if the target already is a
 * mount point, and a negative errno-style value on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        unsigned long flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *to;
        int r;

        to = strappenda(dest, "/sys/fs/cgroup/", hierarchy);

        r = path_is_mount_point(to, false);
        if (r > 0)
                return 0;
        if (r < 0)
                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);

        /* The result is not checked here; if the directory is missing
         * the mount() below fails and reports the problem. */
        mkdir_p(to, 0755);

        if (read_only)
                flags |= MS_RDONLY;

        if (mount("cgroup", to, "cgroup", flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", to);

        return 1;
}
969
970 static int mount_cgroup(const char *dest) {
971         _cleanup_set_free_free_ Set *controllers = NULL;
972         _cleanup_free_ char *own_cgroup_path = NULL;
973         const char *cgroup_root, *systemd_root, *systemd_own;
974         int r;
975
976         controllers = set_new(&string_hash_ops);
977         if (!controllers)
978                 return log_oom();
979
980         r = cg_kernel_controllers(controllers);
981         if (r < 0)
982                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
983
984         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
985         if (r < 0)
986                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
987
988         cgroup_root = strappenda(dest, "/sys/fs/cgroup");
989         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
990                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
991
992         for (;;) {
993                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
994
995                 controller = set_steal_first(controllers);
996                 if (!controller)
997                         break;
998
999                 origin = strappend("/sys/fs/cgroup/", controller);
1000                 if (!origin)
1001                         return log_oom();
1002
1003                 r = readlink_malloc(origin, &combined);
1004                 if (r == -EINVAL) {
1005                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1006
1007                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1008                         if (r < 0)
1009                                 return r;
1010
1011                 } else if (r < 0)
1012                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1013                 else {
1014                         _cleanup_free_ char *target = NULL;
1015
1016                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1017                         if (!target)
1018                                 return log_oom();
1019
1020                         /* A symbolic link, a combination of controllers in one hierarchy */
1021
1022                         if (!filename_is_valid(combined)) {
1023                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1024                                 continue;
1025                         }
1026
1027                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1028                         if (r < 0)
1029                                 return r;
1030
1031                         if (symlink(combined, target) < 0)
1032                                 return log_error_errno(errno, "Failed to create symlink for combined hiearchy: %m");
1033                 }
1034         }
1035
1036         r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1037         if (r < 0)
1038                 return r;
1039
1040         /* Make our own cgroup a (writable) bind mount */
1041         systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1042         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1043                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1044
1045         /* And then remount the systemd cgroup root read-only */
1046         systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1047         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1048                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1049
1050         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1051                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1052
1053         return 0;
1054 }
1055
1056 static int mount_tmpfs(const char *dest) {
1057         char **i, **o;
1058
1059         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1060                 _cleanup_free_ char *where = NULL;
1061                 int r;
1062
1063                 where = strappend(dest, *i);
1064                 if (!where)
1065                         return log_oom();
1066
1067                 r = mkdir_label(where, 0755);
1068                 if (r < 0 && r != -EEXIST)
1069                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1070
1071                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1072                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1073         }
1074
1075         return 0;
1076 }
1077
1078 static int setup_timezone(const char *dest) {
1079         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1080         char *z, *y;
1081         int r;
1082
1083         assert(dest);
1084
1085         /* Fix the timezone, if possible */
1086         r = readlink_malloc("/etc/localtime", &p);
1087         if (r < 0) {
1088                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1089                 return 0;
1090         }
1091
1092         z = path_startswith(p, "../usr/share/zoneinfo/");
1093         if (!z)
1094                 z = path_startswith(p, "/usr/share/zoneinfo/");
1095         if (!z) {
1096                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1097                 return 0;
1098         }
1099
1100         where = strappend(dest, "/etc/localtime");
1101         if (!where)
1102                 return log_oom();
1103
1104         r = readlink_malloc(where, &q);
1105         if (r >= 0) {
1106                 y = path_startswith(q, "../usr/share/zoneinfo/");
1107                 if (!y)
1108                         y = path_startswith(q, "/usr/share/zoneinfo/");
1109
1110                 /* Already pointing to the right place? Then do nothing .. */
1111                 if (y && streq(y, z))
1112                         return 0;
1113         }
1114
1115         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1116         if (!check)
1117                 return log_oom();
1118
1119         if (access(check, F_OK) < 0) {
1120                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1121                 return 0;
1122         }
1123
1124         what = strappend("../usr/share/zoneinfo/", z);
1125         if (!what)
1126                 return log_oom();
1127
1128         r = mkdir_parents(where, 0755);
1129         if (r < 0) {
1130                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1131
1132                 return 0;
1133         }
1134
1135         r = unlink(where);
1136         if (r < 0 && errno != ENOENT) {
1137                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1138
1139                 return 0;
1140         }
1141
1142         if (symlink(what, where) < 0) {
1143                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1144                 return 0;
1145         }
1146
1147         return 0;
1148 }
1149
1150 static int setup_resolv_conf(const char *dest) {
1151         _cleanup_free_ char *where = NULL;
1152         int r;
1153
1154         assert(dest);
1155
1156         if (arg_private_network)
1157                 return 0;
1158
1159         /* Fix resolv.conf, if possible */
1160         where = strappend(dest, "/etc/resolv.conf");
1161         if (!where)
1162                 return log_oom();
1163
1164         /* We don't really care for the results of this really. If it
1165          * fails, it fails, but meh... */
1166         r = mkdir_parents(where, 0755);
1167         if (r < 0) {
1168                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1169
1170                 return 0;
1171         }
1172
1173         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1174         if (r < 0) {
1175                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1176
1177                 return 0;
1178         }
1179
1180         return 0;
1181 }
1182
1183 static int setup_volatile_state(const char *directory) {
1184         const char *p;
1185         int r;
1186
1187         assert(directory);
1188
1189         if (arg_volatile != VOLATILE_STATE)
1190                 return 0;
1191
1192         /* --volatile=state means we simply overmount /var
1193            with a tmpfs, and the rest read-only. */
1194
1195         r = bind_remount_recursive(directory, true);
1196         if (r < 0)
1197                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1198
1199         p = strappenda(directory, "/var");
1200         r = mkdir(p, 0755);
1201         if (r < 0 && errno != EEXIST)
1202                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1203
1204         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1205                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1206
1207         return 0;
1208 }
1209
1210 static int setup_volatile(const char *directory) {
1211         bool tmpfs_mounted = false, bind_mounted = false;
1212         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1213         const char *f, *t;
1214         int r;
1215
1216         assert(directory);
1217
1218         if (arg_volatile != VOLATILE_YES)
1219                 return 0;
1220
1221         /* --volatile=yes means we mount a tmpfs to the root dir, and
1222            the original /usr to use inside it, and that read-only. */
1223
1224         if (!mkdtemp(template))
1225                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1226
1227         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1228                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1229                 r = -errno;
1230                 goto fail;
1231         }
1232
1233         tmpfs_mounted = true;
1234
1235         f = strappenda(directory, "/usr");
1236         t = strappenda(template, "/usr");
1237
1238         r = mkdir(t, 0755);
1239         if (r < 0 && errno != EEXIST) {
1240                 log_error_errno(errno, "Failed to create %s: %m", t);
1241                 r = -errno;
1242                 goto fail;
1243         }
1244
1245         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1246                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1247                 r = -errno;
1248                 goto fail;
1249         }
1250
1251         bind_mounted = true;
1252
1253         r = bind_remount_recursive(t, true);
1254         if (r < 0) {
1255                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1256                 goto fail;
1257         }
1258
1259         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1260                 log_error_errno(errno, "Failed to move root mount: %m");
1261                 r = -errno;
1262                 goto fail;
1263         }
1264
1265         rmdir(template);
1266
1267         return 0;
1268
1269 fail:
1270         if (bind_mounted)
1271                 umount(t);
1272         if (tmpfs_mounted)
1273                 umount(template);
1274         rmdir(template);
1275         return r;
1276 }
1277
1278 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1279
1280         snprintf(s, 37,
1281                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1282                  SD_ID128_FORMAT_VAL(id));
1283
1284         return s;
1285 }
1286
1287 static int setup_boot_id(const char *dest) {
1288         _cleanup_free_ char *from = NULL, *to = NULL;
1289         sd_id128_t rnd = {};
1290         char as_uuid[37];
1291         int r;
1292
1293         assert(dest);
1294
1295         if (arg_share_system)
1296                 return 0;
1297
1298         /* Generate a new randomized boot ID, so that each boot-up of
1299          * the container gets a new one */
1300
1301         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1302         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1303         if (!from || !to)
1304                 return log_oom();
1305
1306         r = sd_id128_randomize(&rnd);
1307         if (r < 0)
1308                 return log_error_errno(r, "Failed to generate random boot id: %m");
1309
1310         id128_format_as_uuid(rnd, as_uuid);
1311
1312         r = write_string_file(from, as_uuid);
1313         if (r < 0)
1314                 return log_error_errno(r, "Failed to write boot id: %m");
1315
1316         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1317                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1318                 r = -errno;
1319         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1320                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1321
1322         unlink(from);
1323         return r;
1324 }
1325
1326 static int copy_devnodes(const char *dest) {
1327
1328         static const char devnodes[] =
1329                 "null\0"
1330                 "zero\0"
1331                 "full\0"
1332                 "random\0"
1333                 "urandom\0"
1334                 "tty\0"
1335                 "net/tun\0";
1336
1337         const char *d;
1338         int r = 0;
1339         _cleanup_umask_ mode_t u;
1340
1341         assert(dest);
1342
1343         u = umask(0000);
1344
1345         NULSTR_FOREACH(d, devnodes) {
1346                 _cleanup_free_ char *from = NULL, *to = NULL;
1347                 struct stat st;
1348
1349                 from = strappend("/dev/", d);
1350                 to = strjoin(dest, "/dev/", d, NULL);
1351                 if (!from || !to)
1352                         return log_oom();
1353
1354                 if (stat(from, &st) < 0) {
1355
1356                         if (errno != ENOENT)
1357                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1358
1359                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1360
1361                         log_error("%s is not a char or block device, cannot copy", from);
1362                         return -EIO;
1363
1364                 } else {
1365                         r = mkdir_parents(to, 0775);
1366                         if (r < 0) {
1367                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1368                                 return -r;
1369                         }
1370
1371                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1372                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1373                 }
1374         }
1375
1376         return r;
1377 }
1378
1379 static int setup_ptmx(const char *dest) {
1380         _cleanup_free_ char *p = NULL;
1381
1382         p = strappend(dest, "/dev/ptmx");
1383         if (!p)
1384                 return log_oom();
1385
1386         if (symlink("pts/ptmx", p) < 0)
1387                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1388
1389         return 0;
1390 }
1391
1392 static int setup_dev_console(const char *dest, const char *console) {
1393         _cleanup_umask_ mode_t u;
1394         const char *to;
1395         struct stat st;
1396         int r;
1397
1398         assert(dest);
1399         assert(console);
1400
1401         u = umask(0000);
1402
1403         if (stat("/dev/null", &st) < 0)
1404                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1405
1406         r = chmod_and_chown(console, 0600, 0, 0);
1407         if (r < 0)
1408                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1409
1410         /* We need to bind mount the right tty to /dev/console since
1411          * ptys can only exist on pts file systems. To have something
1412          * to bind mount things on we create a device node first, and
1413          * use /dev/null for that since we the cgroups device policy
1414          * allows us to create that freely, while we cannot create
1415          * /dev/console. (Note that the major minor doesn't actually
1416          * matter here, since we mount it over anyway). */
1417
1418         to = strappenda(dest, "/dev/console");
1419         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1420                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1421
1422         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1423                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1424
1425         return 0;
1426 }
1427
1428 static int setup_kmsg(const char *dest, int kmsg_socket) {
1429         _cleanup_free_ char *from = NULL, *to = NULL;
1430         _cleanup_umask_ mode_t u;
1431         int r, fd, k;
1432         union {
1433                 struct cmsghdr cmsghdr;
1434                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1435         } control = {};
1436         struct msghdr mh = {
1437                 .msg_control = &control,
1438                 .msg_controllen = sizeof(control),
1439         };
1440         struct cmsghdr *cmsg;
1441
1442         assert(dest);
1443         assert(kmsg_socket >= 0);
1444
1445         u = umask(0000);
1446
1447         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1448          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1449          * on the reading side behave very similar to /proc/kmsg,
1450          * their writing side behaves differently from /dev/kmsg in
1451          * that writing blocks when nothing is reading. In order to
1452          * avoid any problems with containers deadlocking due to this
1453          * we simply make /dev/kmsg unavailable to the container. */
1454         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1455             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1456                 return log_oom();
1457
1458         if (mkfifo(from, 0600) < 0)
1459                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1460
1461         r = chmod_and_chown(from, 0600, 0, 0);
1462         if (r < 0)
1463                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1464
1465         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1466                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1467
1468         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1469         if (fd < 0)
1470                 return log_error_errno(errno, "Failed to open fifo: %m");
1471
1472         cmsg = CMSG_FIRSTHDR(&mh);
1473         cmsg->cmsg_level = SOL_SOCKET;
1474         cmsg->cmsg_type = SCM_RIGHTS;
1475         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1476         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1477
1478         mh.msg_controllen = cmsg->cmsg_len;
1479
1480         /* Store away the fd in the socket, so that it stays open as
1481          * long as we run the child */
1482         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1483         safe_close(fd);
1484
1485         if (k < 0)
1486                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1487
1488         /* And now make the FIFO unavailable as /dev/kmsg... */
1489         unlink(from);
1490         return 0;
1491 }
1492
1493 static int send_rtnl(int send_fd) {
1494         union {
1495                 struct cmsghdr cmsghdr;
1496                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1497         } control = {};
1498         struct msghdr mh = {
1499                 .msg_control = &control,
1500                 .msg_controllen = sizeof(control),
1501         };
1502         struct cmsghdr *cmsg;
1503         _cleanup_close_ int fd = -1;
1504         ssize_t k;
1505
1506         assert(send_fd >= 0);
1507
1508         if (!arg_expose_ports)
1509                 return 0;
1510
1511         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1512         if (fd < 0)
1513                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1514
1515         cmsg = CMSG_FIRSTHDR(&mh);
1516         cmsg->cmsg_level = SOL_SOCKET;
1517         cmsg->cmsg_type = SCM_RIGHTS;
1518         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1519         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1520
1521         mh.msg_controllen = cmsg->cmsg_len;
1522
1523         /* Store away the fd in the socket, so that it stays open as
1524          * long as we run the child */
1525         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1526         if (k < 0)
1527                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1528
1529         return 0;
1530 }
1531
1532 static int flush_ports(union in_addr_union *exposed) {
1533         ExposePort *p;
1534         int r, af = AF_INET;
1535
1536         assert(exposed);
1537
1538         if (!arg_expose_ports)
1539                 return 0;
1540
1541         if (in_addr_is_null(af, exposed))
1542                 return 0;
1543
1544         log_debug("Lost IP address.");
1545
1546         LIST_FOREACH(ports, p, arg_expose_ports) {
1547                 r = fw_add_local_dnat(false,
1548                                       af,
1549                                       p->protocol,
1550                                       NULL,
1551                                       NULL, 0,
1552                                       NULL, 0,
1553                                       p->host_port,
1554                                       exposed,
1555                                       p->container_port,
1556                                       NULL);
1557                 if (r < 0)
1558                         log_warning_errno(r, "Failed to modify firewall: %m");
1559         }
1560
1561         *exposed = IN_ADDR_NULL;
1562         return 0;
1563 }
1564
1565 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1566         _cleanup_free_ struct local_address *addresses = NULL;
1567         _cleanup_free_ char *pretty = NULL;
1568         union in_addr_union new_exposed;
1569         ExposePort *p;
1570         bool add;
1571         int af = AF_INET, r;
1572
1573         assert(exposed);
1574
1575         /* Invoked each time an address is added or removed inside the
1576          * container */
1577
1578         if (!arg_expose_ports)
1579                 return 0;
1580
1581         r = local_addresses(rtnl, 0, af, &addresses);
1582         if (r < 0)
1583                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1584
1585         add = r > 0 &&
1586                 addresses[0].family == af &&
1587                 addresses[0].scope < RT_SCOPE_LINK;
1588
1589         if (!add)
1590                 return flush_ports(exposed);
1591
1592         new_exposed = addresses[0].address;
1593         if (in_addr_equal(af, exposed, &new_exposed))
1594                 return 0;
1595
1596         in_addr_to_string(af, &new_exposed, &pretty);
1597         log_debug("New container IP is %s.", strna(pretty));
1598
1599         LIST_FOREACH(ports, p, arg_expose_ports) {
1600
1601                 r = fw_add_local_dnat(true,
1602                                       af,
1603                                       p->protocol,
1604                                       NULL,
1605                                       NULL, 0,
1606                                       NULL, 0,
1607                                       p->host_port,
1608                                       &new_exposed,
1609                                       p->container_port,
1610                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1611                 if (r < 0)
1612                         log_warning_errno(r, "Failed to modify firewall: %m");
1613         }
1614
1615         *exposed = new_exposed;
1616         return 0;
1617 }
1618
1619 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1620         union in_addr_union *exposed = userdata;
1621
1622         assert(rtnl);
1623         assert(m);
1624         assert(exposed);
1625
1626         expose_ports(rtnl, exposed);
1627         return 0;
1628 }
1629
1630 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1631         union {
1632                 struct cmsghdr cmsghdr;
1633                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1634         } control = {};
1635         struct msghdr mh = {
1636                 .msg_control = &control,
1637                 .msg_controllen = sizeof(control),
1638         };
1639         struct cmsghdr *cmsg;
1640         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1641         int fd, r;
1642         ssize_t k;
1643
1644         assert(event);
1645         assert(recv_fd >= 0);
1646         assert(ret);
1647
1648         if (!arg_expose_ports)
1649                 return 0;
1650
1651         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1652         if (k < 0)
1653                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1654
1655         cmsg = CMSG_FIRSTHDR(&mh);
1656         assert(cmsg->cmsg_level == SOL_SOCKET);
1657         assert(cmsg->cmsg_type == SCM_RIGHTS);
1658         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1659         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1660
1661         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1662         if (r < 0) {
1663                 safe_close(fd);
1664                 return log_error_errno(r, "Failed to create rtnl object: %m");
1665         }
1666
1667         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1668         if (r < 0)
1669                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1670
1671         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1672         if (r < 0)
1673                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1674
1675         r = sd_rtnl_attach_event(rtnl, event, 0);
1676         if (r < 0)
1677                 return log_error_errno(r, "Failed to add to even loop: %m");
1678
1679         *ret = rtnl;
1680         rtnl = NULL;
1681
1682         return 0;
1683 }
1684
1685 static int setup_hostname(void) {
1686
1687         if (arg_share_system)
1688                 return 0;
1689
1690         if (sethostname_idempotent(arg_machine) < 0)
1691                 return -errno;
1692
1693         return 0;
1694 }
1695
1696 static int setup_journal(const char *directory) {
1697         sd_id128_t machine_id, this_id;
1698         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1699         char *id;
1700         int r;
1701
1702         /* Don't link journals in ephemeral mode */
1703         if (arg_ephemeral)
1704                 return 0;
1705
1706         p = strappend(directory, "/etc/machine-id");
1707         if (!p)
1708                 return log_oom();
1709
1710         r = read_one_line_file(p, &b);
1711         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1712                 return 0;
1713         else if (r < 0)
1714                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1715
1716         id = strstrip(b);
1717         if (isempty(id) && arg_link_journal == LINK_AUTO)
1718                 return 0;
1719
1720         /* Verify validity */
1721         r = sd_id128_from_string(id, &machine_id);
1722         if (r < 0)
1723                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1724
1725         r = sd_id128_get_machine(&this_id);
1726         if (r < 0)
1727                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1728
1729         if (sd_id128_equal(machine_id, this_id)) {
1730                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1731                          "Host and machine ids are equal (%s): refusing to link journals", id);
1732                 if (arg_link_journal == LINK_AUTO)
1733                         return 0;
1734                 return -EEXIST;
1735         }
1736
1737         if (arg_link_journal == LINK_NO)
1738                 return 0;
1739
1740         free(p);
1741         p = strappend("/var/log/journal/", id);
1742         q = strjoin(directory, "/var/log/journal/", id, NULL);
1743         if (!p || !q)
1744                 return log_oom();
1745
1746         if (path_is_mount_point(p, false) > 0) {
1747                 if (arg_link_journal != LINK_AUTO) {
1748                         log_error("%s: already a mount point, refusing to use for journal", p);
1749                         return -EEXIST;
1750                 }
1751
1752                 return 0;
1753         }
1754
1755         if (path_is_mount_point(q, false) > 0) {
1756                 if (arg_link_journal != LINK_AUTO) {
1757                         log_error("%s: already a mount point, refusing to use for journal", q);
1758                         return -EEXIST;
1759                 }
1760
1761                 return 0;
1762         }
1763
1764         r = readlink_and_make_absolute(p, &d);
1765         if (r >= 0) {
1766                 if ((arg_link_journal == LINK_GUEST ||
1767                      arg_link_journal == LINK_AUTO) &&
1768                     path_equal(d, q)) {
1769
1770                         r = mkdir_p(q, 0755);
1771                         if (r < 0)
1772                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1773                         return 0;
1774                 }
1775
1776                 if (unlink(p) < 0)
1777                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1778         } else if (r == -EINVAL) {
1779
1780                 if (arg_link_journal == LINK_GUEST &&
1781                     rmdir(p) < 0) {
1782
1783                         if (errno == ENOTDIR) {
1784                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1785                                 return r;
1786                         } else {
1787                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1788                                 return -errno;
1789                         }
1790                 }
1791         } else if (r != -ENOENT) {
1792                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1793                 return r;
1794         }
1795
1796         if (arg_link_journal == LINK_GUEST) {
1797
1798                 if (symlink(q, p) < 0) {
1799                         if (arg_link_journal_try) {
1800                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1801                                 return 0;
1802                         } else {
1803                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1804                                 return -errno;
1805                         }
1806                 }
1807
1808                 r = mkdir_p(q, 0755);
1809                 if (r < 0)
1810                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1811                 return 0;
1812         }
1813
1814         if (arg_link_journal == LINK_HOST) {
1815                 /* don't create parents here -- if the host doesn't have
1816                  * permanent journal set up, don't force it here */
1817                 r = mkdir(p, 0755);
1818                 if (r < 0) {
1819                         if (arg_link_journal_try) {
1820                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1821                                 return 0;
1822                         } else {
1823                                 log_error_errno(errno, "Failed to create %s: %m", p);
1824                                 return r;
1825                         }
1826                 }
1827
1828         } else if (access(p, F_OK) < 0)
1829                 return 0;
1830
1831         if (dir_is_empty(q) == 0)
1832                 log_warning("%s is not empty, proceeding anyway.", q);
1833
1834         r = mkdir_p(q, 0755);
1835         if (r < 0) {
1836                 log_error_errno(errno, "Failed to create %s: %m", q);
1837                 return r;
1838         }
1839
1840         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1841                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1842
1843         return 0;
1844 }
1845
1846 static int drop_capabilities(void) {
1847         return capability_bounding_set_drop(~arg_retain, false);
1848 }
1849
1850 static int register_machine(pid_t pid, int local_ifindex) {
1851         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1852         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1853         int r;
1854
1855         if (!arg_register)
1856                 return 0;
1857
1858         r = sd_bus_default_system(&bus);
1859         if (r < 0)
1860                 return log_error_errno(r, "Failed to open system bus: %m");
1861
1862         if (arg_keep_unit) {
1863                 r = sd_bus_call_method(
1864                                 bus,
1865                                 "org.freedesktop.machine1",
1866                                 "/org/freedesktop/machine1",
1867                                 "org.freedesktop.machine1.Manager",
1868                                 "RegisterMachineWithNetwork",
1869                                 &error,
1870                                 NULL,
1871                                 "sayssusai",
1872                                 arg_machine,
1873                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1874                                 "nspawn",
1875                                 "container",
1876                                 (uint32_t) pid,
1877                                 strempty(arg_directory),
1878                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1879         } else {
1880                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1881
1882                 r = sd_bus_message_new_method_call(
1883                                 bus,
1884                                 &m,
1885                                 "org.freedesktop.machine1",
1886                                 "/org/freedesktop/machine1",
1887                                 "org.freedesktop.machine1.Manager",
1888                                 "CreateMachineWithNetwork");
1889                 if (r < 0)
1890                         return log_error_errno(r, "Failed to create message: %m");
1891
1892                 r = sd_bus_message_append(
1893                                 m,
1894                                 "sayssusai",
1895                                 arg_machine,
1896                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1897                                 "nspawn",
1898                                 "container",
1899                                 (uint32_t) pid,
1900                                 strempty(arg_directory),
1901                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1902                 if (r < 0)
1903                         return log_error_errno(r, "Failed to append message arguments: %m");
1904
1905                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1906                 if (r < 0)
1907                         return log_error_errno(r, "Failed to open container: %m");
1908
1909                 if (!isempty(arg_slice)) {
1910                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1911                         if (r < 0)
1912                                 return log_error_errno(r, "Failed to append slice: %m");
1913                 }
1914
1915                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1916                 if (r < 0)
1917                         return log_error_errno(r, "Failed to add device policy: %m");
1918
1919                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1920                                           /* Allow the container to
1921                                            * access and create the API
1922                                            * device nodes, so that
1923                                            * PrivateDevices= in the
1924                                            * container can work
1925                                            * fine */
1926                                           "/dev/null", "rwm",
1927                                           "/dev/zero", "rwm",
1928                                           "/dev/full", "rwm",
1929                                           "/dev/random", "rwm",
1930                                           "/dev/urandom", "rwm",
1931                                           "/dev/tty", "rwm",
1932                                           "/dev/net/tun", "rwm",
1933                                           /* Allow the container
1934                                            * access to ptys. However,
1935                                            * do not permit the
1936                                            * container to ever create
1937                                            * these device nodes. */
1938                                           "/dev/pts/ptmx", "rw",
1939                                           "char-pts", "rw");
1940                 if (r < 0)
1941                         return log_error_errno(r, "Failed to add device whitelist: %m");
1942
1943                 r = sd_bus_message_close_container(m);
1944                 if (r < 0)
1945                         return log_error_errno(r, "Failed to close container: %m");
1946
1947                 r = sd_bus_call(bus, m, 0, &error, NULL);
1948         }
1949
1950         if (r < 0) {
1951                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1952                 return r;
1953         }
1954
1955         return 0;
1956 }
1957
1958 static int terminate_machine(pid_t pid) {
1959         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1960         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1961         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1962         const char *path;
1963         int r;
1964
1965         if (!arg_register)
1966                 return 0;
1967
1968         r = sd_bus_default_system(&bus);
1969         if (r < 0)
1970                 return log_error_errno(r, "Failed to open system bus: %m");
1971
1972         r = sd_bus_call_method(
1973                         bus,
1974                         "org.freedesktop.machine1",
1975                         "/org/freedesktop/machine1",
1976                         "org.freedesktop.machine1.Manager",
1977                         "GetMachineByPID",
1978                         &error,
1979                         &reply,
1980                         "u",
1981                         (uint32_t) pid);
1982         if (r < 0) {
1983                 /* Note that the machine might already have been
1984                  * cleaned up automatically, hence don't consider it a
1985                  * failure if we cannot get the machine object. */
1986                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1987                 return 0;
1988         }
1989
1990         r = sd_bus_message_read(reply, "o", &path);
1991         if (r < 0)
1992                 return bus_log_parse_error(r);
1993
1994         r = sd_bus_call_method(
1995                         bus,
1996                         "org.freedesktop.machine1",
1997                         path,
1998                         "org.freedesktop.machine1.Machine",
1999                         "Terminate",
2000                         &error,
2001                         NULL,
2002                         NULL);
2003         if (r < 0) {
2004                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2005                 return 0;
2006         }
2007
2008         return 0;
2009 }
2010
2011 static int reset_audit_loginuid(void) {
2012         _cleanup_free_ char *p = NULL;
2013         int r;
2014
2015         if (arg_share_system)
2016                 return 0;
2017
2018         r = read_one_line_file("/proc/self/loginuid", &p);
2019         if (r == -ENOENT)
2020                 return 0;
2021         if (r < 0)
2022                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2023
2024         /* Already reset? */
2025         if (streq(p, "4294967295"))
2026                 return 0;
2027
2028         r = write_string_file("/proc/self/loginuid", "4294967295");
2029         if (r < 0) {
2030                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2031                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2032                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2033                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2034                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2035
2036                 sleep(5);
2037         }
2038
2039         return 0;
2040 }
2041
2042 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2043 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2044 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2045
2046 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2047         uint8_t result[8];
2048         size_t l, sz;
2049         uint8_t *v, *i;
2050         int r;
2051
2052         l = strlen(arg_machine);
2053         sz = sizeof(sd_id128_t) + l;
2054         if (idx > 0)
2055                 sz += sizeof(idx);
2056
2057         v = alloca(sz);
2058
2059         /* fetch some persistent data unique to the host */
2060         r = sd_id128_get_machine((sd_id128_t*) v);
2061         if (r < 0)
2062                 return r;
2063
2064         /* combine with some data unique (on this host) to this
2065          * container instance */
2066         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2067         if (idx > 0) {
2068                 idx = htole64(idx);
2069                 memcpy(i, &idx, sizeof(idx));
2070         }
2071
2072         /* Let's hash the host machine ID plus the container name. We
2073          * use a fixed, but originally randomly created hash key here. */
2074         siphash24(result, v, sz, hash_key.bytes);
2075
2076         assert_cc(ETH_ALEN <= sizeof(result));
2077         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2078
2079         /* see eth_random_addr in the kernel */
2080         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2081         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2082
2083         return 0;
2084 }
2085
2086 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2087         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2088         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2089         struct ether_addr mac_host, mac_container;
2090         int r, i;
2091
2092         if (!arg_private_network)
2093                 return 0;
2094
2095         if (!arg_network_veth)
2096                 return 0;
2097
2098         /* Use two different interface name prefixes depending whether
2099          * we are in bridge mode or not. */
2100         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2101                  arg_network_bridge ? "vb" : "ve", arg_machine);
2102
2103         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2104         if (r < 0)
2105                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2106
2107         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2108         if (r < 0)
2109                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2110
2111         r = sd_rtnl_open(&rtnl, 0);
2112         if (r < 0)
2113                 return log_error_errno(r, "Failed to connect to netlink: %m");
2114
2115         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2116         if (r < 0)
2117                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2118
2119         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2120         if (r < 0)
2121                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2122
2123         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2124         if (r < 0)
2125                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2126
2127         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2128         if (r < 0)
2129                 return log_error_errno(r, "Failed to open netlink container: %m");
2130
2131         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2132         if (r < 0)
2133                 return log_error_errno(r, "Failed to open netlink container: %m");
2134
2135         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2136         if (r < 0)
2137                 return log_error_errno(r, "Failed to open netlink container: %m");
2138
2139         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2140         if (r < 0)
2141                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2142
2143         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2144         if (r < 0)
2145                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2146
2147         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2148         if (r < 0)
2149                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2150
2151         r = sd_rtnl_message_close_container(m);
2152         if (r < 0)
2153                 return log_error_errno(r, "Failed to close netlink container: %m");
2154
2155         r = sd_rtnl_message_close_container(m);
2156         if (r < 0)
2157                 return log_error_errno(r, "Failed to close netlink container: %m");
2158
2159         r = sd_rtnl_message_close_container(m);
2160         if (r < 0)
2161                 return log_error_errno(r, "Failed to close netlink container: %m");
2162
2163         r = sd_rtnl_call(rtnl, m, 0, NULL);
2164         if (r < 0)
2165                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2166
2167         i = (int) if_nametoindex(iface_name);
2168         if (i <= 0)
2169                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2170
2171         *ifi = i;
2172
2173         return 0;
2174 }
2175
2176 static int setup_bridge(const char veth_name[], int *ifi) {
2177         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2178         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2179         int r, bridge;
2180
2181         if (!arg_private_network)
2182                 return 0;
2183
2184         if (!arg_network_veth)
2185                 return 0;
2186
2187         if (!arg_network_bridge)
2188                 return 0;
2189
2190         bridge = (int) if_nametoindex(arg_network_bridge);
2191         if (bridge <= 0)
2192                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2193
2194         *ifi = bridge;
2195
2196         r = sd_rtnl_open(&rtnl, 0);
2197         if (r < 0)
2198                 return log_error_errno(r, "Failed to connect to netlink: %m");
2199
2200         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2201         if (r < 0)
2202                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2203
2204         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2205         if (r < 0)
2206                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2207
2208         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2209         if (r < 0)
2210                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2211
2212         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2213         if (r < 0)
2214                 return log_error_errno(r, "Failed to add netlink master field: %m");
2215
2216         r = sd_rtnl_call(rtnl, m, 0, NULL);
2217         if (r < 0)
2218                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2219
2220         return 0;
2221 }
2222
2223 static int parse_interface(struct udev *udev, const char *name) {
2224         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2225         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2226         int ifi;
2227
2228         ifi = (int) if_nametoindex(name);
2229         if (ifi <= 0)
2230                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2231
2232         sprintf(ifi_str, "n%i", ifi);
2233         d = udev_device_new_from_device_id(udev, ifi_str);
2234         if (!d)
2235                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2236
2237         if (udev_device_get_is_initialized(d) <= 0) {
2238                 log_error("Network interface %s is not initialized yet.", name);
2239                 return -EBUSY;
2240         }
2241
2242         return ifi;
2243 }
2244
2245 static int move_network_interfaces(pid_t pid) {
2246         _cleanup_udev_unref_ struct udev *udev = NULL;
2247         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2248         char **i;
2249         int r;
2250
2251         if (!arg_private_network)
2252                 return 0;
2253
2254         if (strv_isempty(arg_network_interfaces))
2255                 return 0;
2256
2257         r = sd_rtnl_open(&rtnl, 0);
2258         if (r < 0)
2259                 return log_error_errno(r, "Failed to connect to netlink: %m");
2260
2261         udev = udev_new();
2262         if (!udev) {
2263                 log_error("Failed to connect to udev.");
2264                 return -ENOMEM;
2265         }
2266
2267         STRV_FOREACH(i, arg_network_interfaces) {
2268                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2269                 int ifi;
2270
2271                 ifi = parse_interface(udev, *i);
2272                 if (ifi < 0)
2273                         return ifi;
2274
2275                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2276                 if (r < 0)
2277                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2278
2279                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2280                 if (r < 0)
2281                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2282
2283                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2284                 if (r < 0)
2285                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2286         }
2287
2288         return 0;
2289 }
2290
2291 static int setup_macvlan(pid_t pid) {
2292         _cleanup_udev_unref_ struct udev *udev = NULL;
2293         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2294         unsigned idx = 0;
2295         char **i;
2296         int r;
2297
2298         if (!arg_private_network)
2299                 return 0;
2300
2301         if (strv_isempty(arg_network_macvlan))
2302                 return 0;
2303
2304         r = sd_rtnl_open(&rtnl, 0);
2305         if (r < 0)
2306                 return log_error_errno(r, "Failed to connect to netlink: %m");
2307
2308         udev = udev_new();
2309         if (!udev) {
2310                 log_error("Failed to connect to udev.");
2311                 return -ENOMEM;
2312         }
2313
2314         STRV_FOREACH(i, arg_network_macvlan) {
2315                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2316                 _cleanup_free_ char *n = NULL;
2317                 struct ether_addr mac;
2318                 int ifi;
2319
2320                 ifi = parse_interface(udev, *i);
2321                 if (ifi < 0)
2322                         return ifi;
2323
2324                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2325                 if (r < 0)
2326                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2327
2328                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2329                 if (r < 0)
2330                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2331
2332                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2333                 if (r < 0)
2334                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2335
2336                 n = strappend("mv-", *i);
2337                 if (!n)
2338                         return log_oom();
2339
2340                 strshorten(n, IFNAMSIZ-1);
2341
2342                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2343                 if (r < 0)
2344                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2345
2346                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2347                 if (r < 0)
2348                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2349
2350                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2351                 if (r < 0)
2352                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2353
2354                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2355                 if (r < 0)
2356                         return log_error_errno(r, "Failed to open netlink container: %m");
2357
2358                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2359                 if (r < 0)
2360                         return log_error_errno(r, "Failed to open netlink container: %m");
2361
2362                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2363                 if (r < 0)
2364                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2365
2366                 r = sd_rtnl_message_close_container(m);
2367                 if (r < 0)
2368                         return log_error_errno(r, "Failed to close netlink container: %m");
2369
2370                 r = sd_rtnl_message_close_container(m);
2371                 if (r < 0)
2372                         return log_error_errno(r, "Failed to close netlink container: %m");
2373
2374                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2375                 if (r < 0)
2376                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2377         }
2378
2379         return 0;
2380 }
2381
/*
 * Installs the seccomp filter used for the container payload: the default
 * action is "allow", a fixed list of system calls is refused with EPERM, and
 * creation of NETLINK_AUDIT sockets is refused with EAFNOSUPPORT.
 *
 * Returns 0 on success (always 0 when built without HAVE_SECCOMP), a negative
 * errno-style value on failure.
 */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int denied_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto out;
        }

        for (k = 0; k < ELEMENTSOF(denied_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), denied_syscalls[k], 0);

                /* -EFAULT is treated as "unknown syscall" and skipped */
                if (r >= 0 || r == -EFAULT)
                        continue;

                log_error_errno(r, "Failed to block syscall: %m");
                goto out;
        }

        /*
           Audit does not work inside containers, and most of the
           userspace audit hookup fails when run in one. We do not
           care, and simply disallow the creation of audit sockets.

           As a result socket(AF_NETLINK, *, NETLINK_AUDIT) fails with
           EAFNOSUPPORT, which audit userspace takes as a sign that
           audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto out;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto out;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

out:
        /* The filter context is released on success and failure alike */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2461
2462 static int setup_propagate(const char *root) {
2463         const char *p, *q;
2464
2465         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2466         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2467         p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2468         (void) mkdir_p(p, 0600);
2469
2470         q = strappenda(root, "/run/systemd/nspawn/incoming");
2471         mkdir_parents(q, 0755);
2472         mkdir_p(q, 0600);
2473
2474         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2475                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2476
2477         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2478                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2479
2480         return 0;
2481 }
2482
2483 static int setup_image(char **device_path, int *loop_nr) {
2484         struct loop_info64 info = {
2485                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2486         };
2487         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2488         _cleanup_free_ char* loopdev = NULL;
2489         struct stat st;
2490         int r, nr;
2491
2492         assert(device_path);
2493         assert(loop_nr);
2494         assert(arg_image);
2495
2496         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2497         if (fd < 0)
2498                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2499
2500         if (fstat(fd, &st) < 0)
2501                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2502
2503         if (S_ISBLK(st.st_mode)) {
2504                 char *p;
2505
2506                 p = strdup(arg_image);
2507                 if (!p)
2508                         return log_oom();
2509
2510                 *device_path = p;
2511
2512                 *loop_nr = -1;
2513
2514                 r = fd;
2515                 fd = -1;
2516
2517                 return r;
2518         }
2519
2520         if (!S_ISREG(st.st_mode)) {
2521                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2522                 return -EINVAL;
2523         }
2524
2525         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2526         if (control < 0)
2527                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2528
2529         nr = ioctl(control, LOOP_CTL_GET_FREE);
2530         if (nr < 0)
2531                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2532
2533         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2534                 return log_oom();
2535
2536         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2537         if (loop < 0)
2538                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2539
2540         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2541                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2542
2543         if (arg_read_only)
2544                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2545
2546         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2547                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2548
2549         *device_path = loopdev;
2550         loopdev = NULL;
2551
2552         *loop_nr = nr;
2553
2554         r = loop;
2555         loop = -1;
2556
2557         return r;
2558 }
2559
/* Explanatory text appended to the error messages that dissect_image() prints
 * when it cannot make sense of the partition table of the image. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."

/*
 * Probes the partition table of the block device open on 'fd' and picks the
 * partitions to use as root, /home and /srv.
 *
 * GPT: partitions are selected by type UUID (GPT_HOME, GPT_SRV and, where
 * defined, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY); partitions flagged
 * GPT_FLAG_NO_AUTO are skipped, and if several partitions of one type exist
 * the one with the lowest partition number wins. GPT_FLAG_READ_ONLY is
 * reported through the *_rw output parameters.
 *
 * MBR: exactly one partition of type 0x83 whose flags equal 0x80 (bootable)
 * is accepted as root; more than one is an error.
 *
 * On success returns 0. *root_device and *root_device_rw are always set;
 * *secondary tells whether the secondary root partition was chosen because
 * no native one was found. *home_device/*home_device_rw and
 * *srv_device/*srv_device_rw are only written when such a partition was
 * found. The device strings are newly allocated.
 *
 * On failure returns a negative errno-style value; -ENOTSUP when built
 * without HAVE_BLKID.
 */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif

        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;
        bool is_gpt, is_mbr;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                /* Failure without errno set is treated as out-of-memory */
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                /* Log the errno that is returned to the caller */
                log_error_errno(errno, "Failed to list partitions of %s: %m", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate all devices that have the whole-disk device as parent */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Initialized, since the cleanup handler runs on every exit from this scope */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (is_gpt && (flags & GPT_FLAG_NO_AUTO))
                        continue;
                if (is_mbr && (flags != 0x80)) /* Bootable flag */
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif

                } else if (is_mbr) {
                        int type;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        /* Note that there's a certain, intended
                         * asymmetry here: while for GPT we simply
                         * take the first valid partition and ignore
                         * all others of the same type, for MBR we
                         * fail if there are multiple suitable
                         * partitions. This is because the GPT
                         * partition types are defined by us, and
                         * hence we can define their lookup semantics,
                         * while for the MBR logic we reuse existing
                         * definitions, and simply don't want to
                         * invent semantics of our own. */

                        if (root) {
                                log_error("Identified multiple bootable Linux 0x83 partitions on\n"
                                          "    %s\n"
                                          PARTITION_TABLE_BLURB, arg_image);
                                return -EINVAL;
                        }

                        /* The partition number is deliberately not
                         * recorded here: it is only consulted in the
                         * GPT branch, and root_nr does not even exist
                         * when GPT_ROOT_NATIVE is undefined. */

                        r = free_and_strdup(&root, node);
                        if (r < 0)
                                return log_oom();
                }
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        /* Pass the results to the caller. Each local pointer is cleared
         * afterwards so that the cleanup handler does not free the string. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2861
/*
 * Mounts the block device 'what' on 'where', or on 'where' with 'directory'
 * appended if 'directory' is non-NULL. The file system type is detected with
 * libblkid. The mount is read-only unless 'rw' is true and arg_read_only is
 * unset, and always uses MS_NODEV.
 *
 * Returns 0 on success and a negative errno-style value on failure: -EINVAL
 * if the file system type cannot be determined, -ENOTSUP for LUKS volumes
 * and when built without HAVE_BLKID.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A plain
                 * error (-1) is handled by the branch below, matching
                 * the check in dissect_image(). */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2925
2926 static int mount_devices(
2927                 const char *where,
2928                 const char *root_device, bool root_device_rw,
2929                 const char *home_device, bool home_device_rw,
2930                 const char *srv_device, bool srv_device_rw) {
2931         int r;
2932
2933         assert(where);
2934
2935         if (root_device) {
2936                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2937                 if (r < 0)
2938                         return log_error_errno(r, "Failed to mount root directory: %m");
2939         }
2940
2941         if (home_device) {
2942                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2943                 if (r < 0)
2944                         return log_error_errno(r, "Failed to mount home directory: %m");
2945         }
2946
2947         if (srv_device) {
2948                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2949                 if (r < 0)
2950                         return log_error_errno(r, "Failed to mount server data directory: %m");
2951         }
2952
2953         return 0;
2954 }
2955
2956 static void loop_remove(int nr, int *image_fd) {
2957         _cleanup_close_ int control = -1;
2958         int r;
2959
2960         if (nr < 0)
2961                 return;
2962
2963         if (image_fd && *image_fd >= 0) {
2964                 r = ioctl(*image_fd, LOOP_CLR_FD);
2965                 if (r < 0)
2966                         log_debug_errno(errno, "Failed to close loop image: %m");
2967                 *image_fd = safe_close(*image_fd);
2968         }
2969
2970         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2971         if (control < 0) {
2972                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2973                 return;
2974         }
2975
2976         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2977         if (r < 0)
2978                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
2979 }
2980
2981 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2982         int pipe_fds[2];
2983         pid_t pid;
2984
2985         assert(database);
2986         assert(key);
2987         assert(rpid);
2988
2989         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2990                 return log_error_errno(errno, "Failed to allocate pipe: %m");
2991
2992         pid = fork();
2993         if (pid < 0)
2994                 return log_error_errno(errno, "Failed to fork getent child: %m");
2995         else if (pid == 0) {
2996                 int nullfd;
2997                 char *empty_env = NULL;
2998
2999                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3000                         _exit(EXIT_FAILURE);
3001
3002                 if (pipe_fds[0] > 2)
3003                         safe_close(pipe_fds[0]);
3004                 if (pipe_fds[1] > 2)
3005                         safe_close(pipe_fds[1]);
3006
3007                 nullfd = open("/dev/null", O_RDWR);
3008                 if (nullfd < 0)
3009                         _exit(EXIT_FAILURE);
3010
3011                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3012                         _exit(EXIT_FAILURE);
3013
3014                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3015                         _exit(EXIT_FAILURE);
3016
3017                 if (nullfd > 2)
3018                         safe_close(nullfd);
3019
3020                 reset_all_signal_handlers();
3021                 close_all_fds(NULL, 0);
3022
3023                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3024                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3025                 _exit(EXIT_FAILURE);
3026         }
3027
3028         pipe_fds[1] = safe_close(pipe_fds[1]);
3029
3030         *rpid = pid;
3031
3032         return pipe_fds[0];
3033 }
3034
3035 static int change_uid_gid(char **_home) {
3036         char line[LINE_MAX], *x, *u, *g, *h;
3037         const char *word, *state;
3038         _cleanup_free_ uid_t *uids = NULL;
3039         _cleanup_free_ char *home = NULL;
3040         _cleanup_fclose_ FILE *f = NULL;
3041         _cleanup_close_ int fd = -1;
3042         unsigned n_uids = 0;
3043         size_t sz = 0, l;
3044         uid_t uid;
3045         gid_t gid;
3046         pid_t pid;
3047         int r;
3048
3049         assert(_home);
3050
3051         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3052                 /* Reset everything fully to 0, just in case */
3053
3054                 if (setgroups(0, NULL) < 0)
3055                         return log_error_errno(errno, "setgroups() failed: %m");
3056
3057                 if (setresgid(0, 0, 0) < 0)
3058                         return log_error_errno(errno, "setregid() failed: %m");
3059
3060                 if (setresuid(0, 0, 0) < 0)
3061                         return log_error_errno(errno, "setreuid() failed: %m");
3062
3063                 *_home = NULL;
3064                 return 0;
3065         }
3066
3067         /* First, get user credentials */
3068         fd = spawn_getent("passwd", arg_user, &pid);
3069         if (fd < 0)
3070                 return fd;
3071
3072         f = fdopen(fd, "r");
3073         if (!f)
3074                 return log_oom();
3075         fd = -1;
3076
3077         if (!fgets(line, sizeof(line), f)) {
3078
3079                 if (!ferror(f)) {
3080                         log_error("Failed to resolve user %s.", arg_user);
3081                         return -ESRCH;
3082                 }
3083
3084                 log_error_errno(errno, "Failed to read from getent: %m");
3085                 return -errno;
3086         }
3087
3088         truncate_nl(line);
3089
3090         wait_for_terminate_and_warn("getent passwd", pid, true);
3091
3092         x = strchr(line, ':');
3093         if (!x) {
3094                 log_error("/etc/passwd entry has invalid user field.");
3095                 return -EIO;
3096         }
3097
3098         u = strchr(x+1, ':');
3099         if (!u) {
3100                 log_error("/etc/passwd entry has invalid password field.");
3101                 return -EIO;
3102         }
3103
3104         u++;
3105         g = strchr(u, ':');
3106         if (!g) {
3107                 log_error("/etc/passwd entry has invalid UID field.");
3108                 return -EIO;
3109         }
3110
3111         *g = 0;
3112         g++;
3113         x = strchr(g, ':');
3114         if (!x) {
3115                 log_error("/etc/passwd entry has invalid GID field.");
3116                 return -EIO;
3117         }
3118
3119         *x = 0;
3120         h = strchr(x+1, ':');
3121         if (!h) {
3122                 log_error("/etc/passwd entry has invalid GECOS field.");
3123                 return -EIO;
3124         }
3125
3126         h++;
3127         x = strchr(h, ':');
3128         if (!x) {
3129                 log_error("/etc/passwd entry has invalid home directory field.");
3130                 return -EIO;
3131         }
3132
3133         *x = 0;
3134
3135         r = parse_uid(u, &uid);
3136         if (r < 0) {
3137                 log_error("Failed to parse UID of user.");
3138                 return -EIO;
3139         }
3140
3141         r = parse_gid(g, &gid);
3142         if (r < 0) {
3143                 log_error("Failed to parse GID of user.");
3144                 return -EIO;
3145         }
3146
3147         home = strdup(h);
3148         if (!home)
3149                 return log_oom();
3150
3151         /* Second, get group memberships */
3152         fd = spawn_getent("initgroups", arg_user, &pid);
3153         if (fd < 0)
3154                 return fd;
3155
3156         fclose(f);
3157         f = fdopen(fd, "r");
3158         if (!f)
3159                 return log_oom();
3160         fd = -1;
3161
3162         if (!fgets(line, sizeof(line), f)) {
3163                 if (!ferror(f)) {
3164                         log_error("Failed to resolve user %s.", arg_user);
3165                         return -ESRCH;
3166                 }
3167
3168                 log_error_errno(errno, "Failed to read from getent: %m");
3169                 return -errno;
3170         }
3171
3172         truncate_nl(line);
3173
3174         wait_for_terminate_and_warn("getent initgroups", pid, true);
3175
3176         /* Skip over the username and subsequent separator whitespace */
3177         x = line;
3178         x += strcspn(x, WHITESPACE);
3179         x += strspn(x, WHITESPACE);
3180
3181         FOREACH_WORD(word, l, x, state) {
3182                 char c[l+1];
3183
3184                 memcpy(c, word, l);
3185                 c[l] = 0;
3186
3187                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3188                         return log_oom();
3189
3190                 r = parse_uid(c, &uids[n_uids++]);
3191                 if (r < 0) {
3192                         log_error("Failed to parse group data from getent.");
3193                         return -EIO;
3194                 }
3195         }
3196
3197         r = mkdir_parents(home, 0775);
3198         if (r < 0)
3199                 return log_error_errno(r, "Failed to make home root directory: %m");
3200
3201         r = mkdir_safe(home, 0755, uid, gid);
3202         if (r < 0 && r != -EEXIST)
3203                 return log_error_errno(r, "Failed to make home directory: %m");
3204
3205         fchown(STDIN_FILENO, uid, gid);
3206         fchown(STDOUT_FILENO, uid, gid);
3207         fchown(STDERR_FILENO, uid, gid);
3208
3209         if (setgroups(n_uids, uids) < 0)
3210                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3211
3212         if (setresgid(gid, gid, gid) < 0)
3213                 return log_error_errno(errno, "setregid() failed: %m");
3214
3215         if (setresuid(uid, uid, uid) < 0)
3216                 return log_error_errno(errno, "setreuid() failed: %m");
3217
3218         if (_home) {
3219                 *_home = home;
3220                 home = NULL;
3221         }
3222
3223         return 0;
3224 }
3225
3226 /*
3227  * Return values:
3228  * < 0 : wait_for_terminate() failed to get the state of the
3229  *       container, the container was terminated by a signal, or
3230  *       failed for an unknown reason.  No change is made to the
3231  *       container argument.
3232  * > 0 : The program executed in the container terminated with an
3233  *       error.  The exit code of the program executed in the
3234  *       container is returned.  The container argument has been set
3235  *       to CONTAINER_TERMINATED.
3236  *   0 : The container is being rebooted, has been shut down or exited
3237  *       successfully.  The container argument has been set to either
3238  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3239  *
3240  * That is, success is indicated by a return value of zero, and an
3241  * error is indicated by a non-zero value.
3242  */
3243 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3244         siginfo_t status;
3245         int r;
3246
3247         r = wait_for_terminate(pid, &status);
3248         if (r < 0)
3249                 return log_warning_errno(r, "Failed to wait for container: %m");
3250
3251         switch (status.si_code) {
3252
3253         case CLD_EXITED:
3254                 if (status.si_status == 0) {
3255                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3256
3257                 } else
3258                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3259
3260                 *container = CONTAINER_TERMINATED;
3261                 return status.si_status;
3262
3263         case CLD_KILLED:
3264                 if (status.si_status == SIGINT) {
3265
3266                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3267                         *container = CONTAINER_TERMINATED;
3268                         return 0;
3269
3270                 } else if (status.si_status == SIGHUP) {
3271
3272                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3273                         *container = CONTAINER_REBOOTED;
3274                         return 0;
3275                 }
3276
3277                 /* CLD_KILLED fallthrough */
3278
3279         case CLD_DUMPED:
3280                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3281                 return -EIO;
3282
3283         default:
3284                 log_error("Container %s failed due to unknown reason.", arg_machine);
3285                 return -EIO;
3286         }
3287
3288         return r;
3289 }
3290
/* Signal handler that intentionally does nothing. main() installs it
 * as the SIGCHLD handler so that the signal merely interrupts the
 * parent's blocking calls. */
static void nop_handler(int sig) {
        (void) sig;
}
3292
3293 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3294         pid_t pid;
3295
3296         pid = PTR_TO_UINT32(userdata);
3297         if (pid > 0) {
3298                 if (kill(pid, SIGRTMIN+3) >= 0) {
3299                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3300                         sd_event_source_set_userdata(s, NULL);
3301                         return 0;
3302                 }
3303         }
3304
3305         sd_event_exit(sd_event_source_get_event(s), 0);
3306         return 0;
3307 }
3308
3309 static int determine_names(void) {
3310         int r;
3311
3312         if (!arg_image && !arg_directory) {
3313                 if (arg_machine) {
3314                         _cleanup_(image_unrefp) Image *i = NULL;
3315
3316                         r = image_find(arg_machine, &i);
3317                         if (r < 0)
3318                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3319                         else if (r == 0) {
3320                                 log_error("No image for machine '%s': %m", arg_machine);
3321                                 return -ENOENT;
3322                         }
3323
3324                         if (i->type == IMAGE_GPT)
3325                                 r = set_sanitized_path(&arg_image, i->path);
3326                         else
3327                                 r = set_sanitized_path(&arg_directory, i->path);
3328                         if (r < 0)
3329                                 return log_error_errno(r, "Invalid image directory: %m");
3330
3331                         arg_read_only = arg_read_only || i->read_only;
3332                 } else
3333                         arg_directory = get_current_dir_name();
3334
3335                 if (!arg_directory && !arg_machine) {
3336                         log_error("Failed to determine path, please use -D or -i.");
3337                         return -EINVAL;
3338                 }
3339         }
3340
3341         if (!arg_machine) {
3342                 if (arg_directory && path_equal(arg_directory, "/"))
3343                         arg_machine = gethostname_malloc();
3344                 else
3345                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3346
3347                 if (!arg_machine)
3348                         return log_oom();
3349
3350                 hostname_cleanup(arg_machine, false);
3351                 if (!machine_name_is_valid(arg_machine)) {
3352                         log_error("Failed to determine machine name automatically, please use -M.");
3353                         return -EINVAL;
3354                 }
3355
3356                 if (arg_ephemeral) {
3357                         char *b;
3358
3359                         /* Add a random suffix when this is an
3360                          * ephemeral machine, so that we can run many
3361                          * instances at once without manually having
3362                          * to specify -M each time. */
3363
3364                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3365                                 return log_oom();
3366
3367                         free(arg_machine);
3368                         arg_machine = b;
3369                 }
3370         }
3371
3372         return 0;
3373 }
3374
3375 int main(int argc, char *argv[]) {
3376
3377         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3378         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3379         _cleanup_close_ int master = -1, image_fd = -1;
3380         _cleanup_fdset_free_ FDSet *fds = NULL;
3381         int r, n_fd_passed, loop_nr = -1;
3382         char veth_name[IFNAMSIZ];
3383         bool secondary = false, remove_subvol = false;
3384         sigset_t mask, mask_chld;
3385         pid_t pid = 0;
3386         int ret = EXIT_SUCCESS;
3387         union in_addr_union exposed = {};
3388         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3389
3390         log_parse_environment();
3391         log_open();
3392
3393         r = parse_argv(argc, argv);
3394         if (r <= 0)
3395                 goto finish;
3396
3397         r = determine_names();
3398         if (r < 0)
3399                 goto finish;
3400
3401         if (geteuid() != 0) {
3402                 log_error("Need to be root.");
3403                 r = -EPERM;
3404                 goto finish;
3405         }
3406
3407         if (sd_booted() <= 0) {
3408                 log_error("Not running on a systemd system.");
3409                 r = -EINVAL;
3410                 goto finish;
3411         }
3412
3413         log_close();
3414         n_fd_passed = sd_listen_fds(false);
3415         if (n_fd_passed > 0) {
3416                 r = fdset_new_listen_fds(&fds, false);
3417                 if (r < 0) {
3418                         log_error_errno(r, "Failed to collect file descriptors: %m");
3419                         goto finish;
3420                 }
3421         }
3422         fdset_close_others(fds);
3423         log_open();
3424
3425         if (arg_directory) {
3426                 assert(!arg_image);
3427
3428                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3429                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3430                         r = -EINVAL;
3431                         goto finish;
3432                 }
3433
3434                 if (arg_ephemeral) {
3435                         _cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
3436                         char *np;
3437
3438                         /* If the specified path is a mount point we
3439                          * generate the new snapshot immediately
3440                          * inside it under a random name. However if
3441                          * the specified is not a mount point we
3442                          * create the new snapshot in the parent
3443                          * directory, just next to it. */
3444                         r = path_is_mount_point(arg_directory, false);
3445                         if (r < 0) {
3446                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3447                                 goto finish;
3448                         }
3449                         if (r > 0)
3450                                 r = tempfn_random_child(arg_directory, &np);
3451                         else
3452                                 r = tempfn_random(arg_directory, &np);
3453                         if (r < 0) {
3454                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3455                                 goto finish;
3456                         }
3457
3458                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3459                         if (r < 0) {
3460                                 log_error_errno(r, "Failed to lock %s: %m", np);
3461                                 goto finish;
3462                         }
3463
3464                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3465                         if (r < 0) {
3466                                 free(np);
3467                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3468                                 goto finish;
3469                         }
3470
3471                         free(arg_directory);
3472                         arg_directory = np;
3473
3474                         remove_subvol = true;
3475
3476                 } else {
3477                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3478                         if (r == -EBUSY) {
3479                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3480                                 goto finish;
3481                         }
3482                         if (r < 0) {
3483                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3484                                 return r;
3485                         }
3486
3487                         if (arg_template) {
3488                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3489                                 if (r == -EEXIST) {
3490                                         if (!arg_quiet)
3491                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3492                                 } else if (r < 0) {
3493                                         log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
3494                                         goto finish;
3495                                 } else {
3496                                         if (!arg_quiet)
3497                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3498                                 }
3499                         }
3500                 }
3501
3502                 if (arg_boot) {
3503                         if (path_is_os_tree(arg_directory) <= 0) {
3504                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3505                                 r = -EINVAL;
3506                                 goto finish;
3507                         }
3508                 } else {
3509                         const char *p;
3510
3511                         p = strappenda(arg_directory,
3512                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3513                         if (access(p, F_OK) < 0) {
3514                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3515                                 r = -EINVAL;
3516                                 goto finish;
3517                         }
3518                 }
3519
3520         } else {
3521                 char template[] = "/tmp/nspawn-root-XXXXXX";
3522
3523                 assert(arg_image);
3524                 assert(!arg_template);
3525
3526                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3527                 if (r == -EBUSY) {
3528                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3529                         goto finish;
3530                 }
3531                 if (r < 0) {
3532                         r = log_error_errno(r, "Failed to create image lock: %m");
3533                         goto finish;
3534                 }
3535
3536                 if (!mkdtemp(template)) {
3537                         log_error_errno(errno, "Failed to create temporary directory: %m");
3538                         r = -errno;
3539                         goto finish;
3540                 }
3541
3542                 arg_directory = strdup(template);
3543                 if (!arg_directory) {
3544                         r = log_oom();
3545                         goto finish;
3546                 }
3547
3548                 image_fd = setup_image(&device_path, &loop_nr);
3549                 if (image_fd < 0) {
3550                         r = image_fd;
3551                         goto finish;
3552                 }
3553
3554                 r = dissect_image(image_fd,
3555                                   &root_device, &root_device_rw,
3556                                   &home_device, &home_device_rw,
3557                                   &srv_device, &srv_device_rw,
3558                                   &secondary);
3559                 if (r < 0)
3560                         goto finish;
3561         }
3562
3563         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3564         if (master < 0) {
3565                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3566                 goto finish;
3567         }
3568
3569         r = ptsname_malloc(master, &console);
3570         if (r < 0) {
3571                 r = log_error_errno(r, "Failed to determine tty name: %m");
3572                 goto finish;
3573         }
3574
3575         if (!arg_quiet)
3576                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3577                          arg_machine, arg_image ?: arg_directory);
3578
3579         if (unlockpt(master) < 0) {
3580                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3581                 goto finish;
3582         }
3583
3584         assert_se(sigemptyset(&mask) == 0);
3585         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3586         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3587
3588         assert_se(sigemptyset(&mask_chld) == 0);
3589         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3590
3591         for (;;) {
3592                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3593                 ContainerStatus container_status;
3594                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3595                 struct sigaction sa = {
3596                         .sa_handler = nop_handler,
3597                         .sa_flags = SA_NOCLDSTOP,
3598                 };
3599
3600                 r = barrier_create(&barrier);
3601                 if (r < 0) {
3602                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3603                         goto finish;
3604                 }
3605
3606                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3607                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3608                         goto finish;
3609                 }
3610
3611                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3612                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3613                         goto finish;
3614                 }
3615
3616                 /* Child can be killed before execv(), so handle SIGCHLD
3617                  * in order to interrupt parent's blocking calls and
3618                  * give it a chance to call wait() and terminate. */
3619                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3620                 if (r < 0) {
3621                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3622                         goto finish;
3623                 }
3624
3625                 r = sigaction(SIGCHLD, &sa, NULL);
3626                 if (r < 0) {
3627                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3628                         goto finish;
3629                 }
3630
3631                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3632                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3633                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3634                 if (pid < 0) {
3635                         if (errno == EINVAL)
3636                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3637                         else
3638                                 r = log_error_errno(errno, "clone() failed: %m");
3639
3640                         goto finish;
3641                 }
3642
3643                 if (pid == 0) {
3644                         /* child */
3645                         _cleanup_free_ char *home = NULL;
3646                         unsigned n_env = 2;
3647                         const char *envp[] = {
3648                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3649                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3650                                 NULL, /* TERM */
3651                                 NULL, /* HOME */
3652                                 NULL, /* USER */
3653                                 NULL, /* LOGNAME */
3654                                 NULL, /* container_uuid */
3655                                 NULL, /* LISTEN_FDS */
3656                                 NULL, /* LISTEN_PID */
3657                                 NULL
3658                         };
3659                         char **env_use;
3660
3661                         barrier_set_role(&barrier, BARRIER_CHILD);
3662
3663                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3664                         if (envp[n_env])
3665                                 n_env ++;
3666
3667                         master = safe_close(master);
3668
3669                         close_nointr(STDIN_FILENO);
3670                         close_nointr(STDOUT_FILENO);
3671                         close_nointr(STDERR_FILENO);
3672
3673                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3674                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3675
3676                         reset_all_signal_handlers();
3677                         reset_signal_mask();
3678
3679                         r = open_terminal(console, O_RDWR);
3680                         if (r != STDIN_FILENO) {
3681                                 if (r >= 0) {
3682                                         safe_close(r);
3683                                         r = -EINVAL;
3684                                 }
3685
3686                                 log_error_errno(r, "Failed to open console: %m");
3687                                 _exit(EXIT_FAILURE);
3688                         }
3689
3690                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3691                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3692                                 log_error_errno(errno, "Failed to duplicate console: %m");
3693                                 _exit(EXIT_FAILURE);
3694                         }
3695
3696                         if (setsid() < 0) {
3697                                 log_error_errno(errno, "setsid() failed: %m");
3698                                 _exit(EXIT_FAILURE);
3699                         }
3700
3701                         if (reset_audit_loginuid() < 0)
3702                                 _exit(EXIT_FAILURE);
3703
3704                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3705                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3706                                 _exit(EXIT_FAILURE);
3707                         }
3708
3709                         /* Mark everything as slave, so that we still
3710                          * receive mounts from the real root, but don't
3711                          * propagate mounts to the real root. */
3712                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3713                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3714                                 _exit(EXIT_FAILURE);
3715                         }
3716
3717                         if (mount_devices(arg_directory,
3718                                           root_device, root_device_rw,
3719                                           home_device, home_device_rw,
3720                                           srv_device, srv_device_rw) < 0)
3721                                 _exit(EXIT_FAILURE);
3722
3723                         /* Turn directory into bind mount */
3724                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3725                                 log_error_errno(errno, "Failed to make bind mount: %m");
3726                                 _exit(EXIT_FAILURE);
3727                         }
3728
3729                         r = setup_volatile(arg_directory);
3730                         if (r < 0)
3731                                 _exit(EXIT_FAILURE);
3732
3733                         if (setup_volatile_state(arg_directory) < 0)
3734                                 _exit(EXIT_FAILURE);
3735
3736                         r = base_filesystem_create(arg_directory);
3737                         if (r < 0)
3738                                 _exit(EXIT_FAILURE);
3739
3740                         if (arg_read_only) {
3741                                 r = bind_remount_recursive(arg_directory, true);
3742                                 if (r < 0) {
3743                                         log_error_errno(r, "Failed to make tree read-only: %m");
3744                                         _exit(EXIT_FAILURE);
3745                                 }
3746                         }
3747
3748                         if (mount_all(arg_directory) < 0)
3749                                 _exit(EXIT_FAILURE);
3750
3751                         if (copy_devnodes(arg_directory) < 0)
3752                                 _exit(EXIT_FAILURE);
3753
3754                         if (setup_ptmx(arg_directory) < 0)
3755                                 _exit(EXIT_FAILURE);
3756
3757                         dev_setup(arg_directory);
3758
3759                         if (setup_propagate(arg_directory) < 0)
3760                                 _exit(EXIT_FAILURE);
3761
3762                         if (setup_seccomp() < 0)
3763                                 _exit(EXIT_FAILURE);
3764
3765                         if (setup_dev_console(arg_directory, console) < 0)
3766                                 _exit(EXIT_FAILURE);
3767
3768                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3769                                 _exit(EXIT_FAILURE);
3770                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3771
3772                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
3773                                 _exit(EXIT_FAILURE);
3774                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3775
3776                         /* Tell the parent that we are ready, and that
3777                          * it can cgroupify us to that we lack access
3778                          * to certain devices and resources. */
3779                         (void) barrier_place(&barrier);
3780
3781                         if (setup_boot_id(arg_directory) < 0)
3782                                 _exit(EXIT_FAILURE);
3783
3784                         if (setup_timezone(arg_directory) < 0)
3785                                 _exit(EXIT_FAILURE);
3786
3787                         if (setup_resolv_conf(arg_directory) < 0)
3788                                 _exit(EXIT_FAILURE);
3789
3790                         if (setup_journal(arg_directory) < 0)
3791                                 _exit(EXIT_FAILURE);
3792
3793                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3794                                 _exit(EXIT_FAILURE);
3795
3796                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3797                                 _exit(EXIT_FAILURE);
3798
3799                         if (mount_tmpfs(arg_directory) < 0)
3800                                 _exit(EXIT_FAILURE);
3801
3802                         /* Wait until we are cgroup-ified, so that we
3803                          * can mount the right cgroup path writable */
3804                         (void) barrier_sync_next(&barrier);
3805
3806                         if (mount_cgroup(arg_directory) < 0)
3807                                 _exit(EXIT_FAILURE);
3808
3809                         if (chdir(arg_directory) < 0) {
3810                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3811                                 _exit(EXIT_FAILURE);
3812                         }
3813
3814                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3815                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3816                                 _exit(EXIT_FAILURE);
3817                         }
3818
3819                         if (chroot(".") < 0) {
3820                                 log_error_errno(errno, "chroot() failed: %m");
3821                                 _exit(EXIT_FAILURE);
3822                         }
3823
3824                         if (chdir("/") < 0) {
3825                                 log_error_errno(errno, "chdir() failed: %m");
3826                                 _exit(EXIT_FAILURE);
3827                         }
3828
3829                         umask(0022);
3830
3831                         if (arg_private_network)
3832                                 loopback_setup();
3833
3834                         if (drop_capabilities() < 0) {
3835                                 log_error_errno(errno, "drop_capabilities() failed: %m");
3836                                 _exit(EXIT_FAILURE);
3837                         }
3838
3839                         r = change_uid_gid(&home);
3840                         if (r < 0)
3841                                 _exit(EXIT_FAILURE);
3842
3843                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3844                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3845                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3846                                 log_oom();
3847                                 _exit(EXIT_FAILURE);
3848                         }
3849
3850                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3851                                 char as_uuid[37];
3852
3853                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3854                                         log_oom();
3855                                         _exit(EXIT_FAILURE);
3856                                 }
3857                         }
3858
3859                         if (fdset_size(fds) > 0) {
3860                                 r = fdset_cloexec(fds, false);
3861                                 if (r < 0) {
3862                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3863                                         _exit(EXIT_FAILURE);
3864                                 }
3865
3866                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3867                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3868                                         log_oom();
3869                                         _exit(EXIT_FAILURE);
3870                                 }
3871                         }
3872
3873                         setup_hostname();
3874
3875                         if (arg_personality != 0xffffffffLU) {
3876                                 if (personality(arg_personality) < 0) {
3877                                         log_error_errno(errno, "personality() failed: %m");
3878                                         _exit(EXIT_FAILURE);
3879                                 }
3880                         } else if (secondary) {
3881                                 if (personality(PER_LINUX32) < 0) {
3882                                         log_error_errno(errno, "personality() failed: %m");
3883                                         _exit(EXIT_FAILURE);
3884                                 }
3885                         }
3886
3887 #ifdef HAVE_SELINUX
3888                         if (arg_selinux_context)
3889                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3890                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3891                                         _exit(EXIT_FAILURE);
3892                                 }
3893 #endif
3894
3895                         if (!strv_isempty(arg_setenv)) {
3896                                 char **n;
3897
3898                                 n = strv_env_merge(2, envp, arg_setenv);
3899                                 if (!n) {
3900                                         log_oom();
3901                                         _exit(EXIT_FAILURE);
3902                                 }
3903
3904                                 env_use = n;
3905                         } else
3906                                 env_use = (char**) envp;
3907
3908                         /* Wait until the parent is ready with the setup, too... */
3909                         if (!barrier_place_and_sync(&barrier))
3910                                 _exit(EXIT_FAILURE);
3911
3912                         if (arg_boot) {
3913                                 char **a;
3914                                 size_t l;
3915
3916                                 /* Automatically search for the init system */
3917
3918                                 l = 1 + argc - optind;
3919                                 a = newa(char*, l + 1);
3920                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3921
3922                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3923                                 execve(a[0], a, env_use);
3924
3925                                 a[0] = (char*) "/lib/systemd/systemd";
3926                                 execve(a[0], a, env_use);
3927
3928                                 a[0] = (char*) "/sbin/init";
3929                                 execve(a[0], a, env_use);
3930                         } else if (argc > optind)
3931                                 execvpe(argv[optind], argv + optind, env_use);
3932                         else {
3933                                 chdir(home ? home : "/root");
3934                                 execle("/bin/bash", "-bash", NULL, env_use);
3935                                 execle("/bin/sh", "-sh", NULL, env_use);
3936                         }
3937
3938                         log_error_errno(errno, "execv() failed: %m");
3939                         _exit(EXIT_FAILURE);
3940                 }
3941
3942                 barrier_set_role(&barrier, BARRIER_PARENT);
3943                 fdset_free(fds);
3944                 fds = NULL;
3945
3946                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3947                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3948
3949                 /* Wait for the most basic Child-setup to be done,
3950                  * before we add hardware to it, and place it in a
3951                  * cgroup. */
3952                 if (barrier_sync_next(&barrier)) {
3953                         int ifi = 0;
3954
3955                         r = move_network_interfaces(pid);
3956                         if (r < 0)
3957                                 goto finish;
3958
3959                         r = setup_veth(pid, veth_name, &ifi);
3960                         if (r < 0)
3961                                 goto finish;
3962
3963                         r = setup_bridge(veth_name, &ifi);
3964                         if (r < 0)
3965                                 goto finish;
3966
3967                         r = setup_macvlan(pid);
3968                         if (r < 0)
3969                                 goto finish;
3970
3971                         r = register_machine(pid, ifi);
3972                         if (r < 0)
3973                                 goto finish;
3974
3975                         /* Block SIGCHLD here, before notifying child.
3976                          * process_pty() will handle it with the other signals. */
3977                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3978                         if (r < 0)
3979                                 goto finish;
3980
3981                         /* Reset signal to default */
3982                         r = default_signals(SIGCHLD, -1);
3983                         if (r < 0)
3984                                 goto finish;
3985
3986                         /* Notify the child that the parent is ready with all
3987                          * its setup, and that the child can now hand over
3988                          * control to the code to run inside the container. */
3989                         (void) barrier_place(&barrier);
3990
3991                         /* And wait until the child is completely ready now. */
3992                         if (barrier_place_and_sync(&barrier)) {
3993                                 _cleanup_event_unref_ sd_event *event = NULL;
3994                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3995                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
3996                                 char last_char = 0;
3997
3998                                 sd_notifyf(false,
3999                                            "READY=1\n"
4000                                            "STATUS=Container running.\n"
4001                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4002
4003                                 r = sd_event_new(&event);
4004                                 if (r < 0) {
4005                                         log_error_errno(r, "Failed to get default event source: %m");
4006                                         goto finish;
4007                                 }
4008
4009                                 if (arg_boot) {
4010                                         /* Try to kill the init system on SIGINT or SIGTERM */
4011                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4012                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4013                                 } else {
4014                                         /* Immediately exit */
4015                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4016                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4017                                 }
4018
4019                                 /* simply exit on sigchld */
4020                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4021
4022                                 if (arg_expose_ports) {
4023                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4024                                         if (r < 0)
4025                                                 goto finish;
4026
4027                                         (void) expose_ports(rtnl, &exposed);
4028                                 }
4029
4030                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4031
4032                                 r = pty_forward_new(event, master, true, &forward);
4033                                 if (r < 0) {
4034                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4035                                         goto finish;
4036                                 }
4037
4038                                 r = sd_event_loop(event);
4039                                 if (r < 0) {
4040                                         log_error_errno(r, "Failed to run event loop: %m");
4041                                         goto finish;
4042                                 }
4043
4044                                 pty_forward_get_last_char(forward, &last_char);
4045
4046                                 forward = pty_forward_free(forward);
4047
4048                                 if (!arg_quiet && last_char != '\n')
4049                                         putc('\n', stdout);
4050
4051                                 /* Kill if it is not dead yet anyway */
4052                                 terminate_machine(pid);
4053                         }
4054                 }
4055
4056                 /* Normally redundant, but better safe than sorry */
4057                 kill(pid, SIGKILL);
4058
4059                 r = wait_for_container(pid, &container_status);
4060                 pid = 0;
4061
4062                 if (r < 0)
4063                         /* We failed to wait for the container, or the
4064                          * container exited abnormally */
4065                         goto finish;
4066                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4067                         /* The container exited with a non-zero
4068                          * status, or with zero status and no reboot
4069                          * was requested. */
4070                         ret = r;
4071                         break;
4072                 }
4073
4074                 /* CONTAINER_REBOOTED, loop again */
4075
4076                 if (arg_keep_unit) {
4077                         /* Special handling if we are running as a
4078                          * service: instead of simply restarting the
4079                          * machine we want to restart the entire
4080                          * service, so let's inform systemd about this
4081                          * with the special exit code 133. The service
4082                          * file uses RestartForceExitStatus=133 so
4083                          * that this results in a full nspawn
4084                          * restart. This is necessary since we might
4085                          * have cgroup parameters set we want to have
4086                          * flushed out. */
4087                         ret = 133;
4088                         r = 0;
4089                         break;
4090                 }
4091
4092                 flush_ports(&exposed);
4093         }
4094
4095 finish:
4096         sd_notify(false,
4097                   "STOPPING=1\n"
4098                   "STATUS=Terminating...");
4099
4100         loop_remove(loop_nr, &image_fd);
4101
4102         if (pid > 0)
4103                 kill(pid, SIGKILL);
4104
4105         if (remove_subvol && arg_directory) {
4106                 int k;
4107
4108                 k = btrfs_subvol_remove(arg_directory);
4109                 if (k < 0)
4110                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4111         }
4112
4113         if (arg_machine) {
4114                 const char *p;
4115
4116                 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
4117                 (void) rm_rf(p, false, true, false);
4118         }
4119
4120         free(arg_directory);
4121         free(arg_template);
4122         free(arg_image);
4123         free(arg_machine);
4124         free(arg_user);
4125         strv_free(arg_setenv);
4126         strv_free(arg_network_interfaces);
4127         strv_free(arg_network_macvlan);
4128         strv_free(arg_bind);
4129         strv_free(arg_bind_ro);
4130         strv_free(arg_tmpfs);
4131
4132         flush_ports(&exposed);
4133
4134         while (arg_expose_ports) {
4135                 ExposePort *p = arg_expose_ports;
4136                 LIST_REMOVE(ports, arg_expose_ports, p);
4137                 free(p);
4138         }
4139
4140         return r < 0 ? EXIT_FAILURE : ret;
4141 }