chiark / gitweb /
nspawn: wait until udev has probed a loopback device before making use of it
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
/* One port mapping requested with -p/--port=. parse_argv() allocates one entry per option
 * and prepends it to the arg_expose_ports list. */
107 typedef struct ExposePort {
        /* IPPROTO_TCP or IPPROTO_UDP; TCP is used when the option carries no "tcp:"/"udp:" prefix */
108         int protocol;
        /* Port on the host side; parse_argv() rejects 0 and duplicates of (protocol, host_port) */
109         uint16_t host_port;
        /* Port on the container side; equals host_port when only one port was given */
110         uint16_t container_port;
        /* List linkage under the name "ports", as used by LIST_FOREACH()/LIST_PREPEND() */
111         LIST_FIELDS(struct ExposePort, ports);
112 } ExposePort;
113
/* Ways a container run can end.
 * NOTE(review): not referenced in the visible part of this file; presumably reported by the code
 * that waits for the container so the caller can tell exit from reboot — confirm against the users. */
114 typedef enum ContainerStatus {
115         CONTAINER_TERMINATED,
116         CONTAINER_REBOOTED
117 } ContainerStatus;
118
/* Journal linking mode selected with --link-journal= (or -j). The "try-" variants of the option
 * map to the same values here and additionally set arg_link_journal_try. */
119 typedef enum LinkJournal {
        /* --link-journal=no */
120         LINK_NO,
        /* --link-journal=auto; also the initial value of arg_link_journal */
121         LINK_AUTO,
        /* --link-journal=host / try-host */
122         LINK_HOST,
        /* --link-journal=guest / try-guest, and -j */
123         LINK_GUEST
124 } LinkJournal;
125
/* Volatile mode selected with --volatile[=MODE]. */
126 typedef enum Volatile {
        /* Initial value of arg_volatile; also chosen when MODE parses as a false boolean */
127         VOLATILE_NO,
        /* Bare --volatile, or MODE parses as a true boolean */
128         VOLATILE_YES,
        /* --volatile=state */
129         VOLATILE_STATE,
130 } Volatile;
131
/* Command line state. Everything below is filled in by parse_argv(); the initialisers are the
 * defaults that apply when the corresponding option is not given. */

/* Paths and identity. arg_directory and arg_template are heap strings produced by
 * set_sanitized_path(); arg_user and arg_machine are heap copies of the option argument. */
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
/* These three are assigned directly from optarg, i.e. they point into argv and are not owned. */
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
/* Capability bitmask (bit N = capability number N). This is the default set; at the end of
 * parse_argv() it is extended by --capability=, gets CAP_NET_ADMIN added when private networking
 * is on, and is finally reduced by --drop-capability=. */
146 static uint64_t arg_retain =
147         (1ULL << CAP_CHOWN) |
148         (1ULL << CAP_DAC_OVERRIDE) |
149         (1ULL << CAP_DAC_READ_SEARCH) |
150         (1ULL << CAP_FOWNER) |
151         (1ULL << CAP_FSETID) |
152         (1ULL << CAP_IPC_OWNER) |
153         (1ULL << CAP_KILL) |
154         (1ULL << CAP_LEASE) |
155         (1ULL << CAP_LINUX_IMMUTABLE) |
156         (1ULL << CAP_NET_BIND_SERVICE) |
157         (1ULL << CAP_NET_BROADCAST) |
158         (1ULL << CAP_NET_RAW) |
159         (1ULL << CAP_SETGID) |
160         (1ULL << CAP_SETFCAP) |
161         (1ULL << CAP_SETPCAP) |
162         (1ULL << CAP_SETUID) |
163         (1ULL << CAP_SYS_ADMIN) |
164         (1ULL << CAP_SYS_CHROOT) |
165         (1ULL << CAP_SYS_NICE) |
166         (1ULL << CAP_SYS_PTRACE) |
167         (1ULL << CAP_SYS_TTY_CONFIG) |
168         (1ULL << CAP_SYS_RESOURCE) |
169         (1ULL << CAP_SYS_BOOT) |
170         (1ULL << CAP_AUDIT_WRITE) |
171         (1ULL << CAP_AUDIT_CONTROL) |
172         (1ULL << CAP_MKNOD);
/* String vectors stored as consecutive pairs: arg_bind/arg_bind_ro hold the part before the ':'
 * followed by the part after it (the same path twice when no ':' was given); arg_tmpfs holds
 * (path, mount options), with "mode=0755" as options when none were given. */
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
/* NAME=VALUE assignments collected from --setenv= */
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
/* Forced to false by parse_argv() when --share-system is given */
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
/* Networking. Any of the interface/macvlan/veth/bridge options also turns on arg_private_network. */
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static bool arg_network_veth = false;
184 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is the value parse_argv() treats as "unknown personality" when returned by
 * personality_from_string(). NOTE(review): as the initial value it presumably means
 * "not requested" — confirm at the place where the personality is applied. */
185 static unsigned long arg_personality = 0xffffffffLU;
186 static char *arg_image = NULL;
187 static Volatile arg_volatile = VOLATILE_NO;
/* Head of the list of --port= mappings */
188 static ExposePort *arg_expose_ports = NULL;
189
190 static void help(void) {
191         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
192                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
193                "  -h --help                 Show this help\n"
194                "     --version              Print version string\n"
195                "  -q --quiet                Do not show status information\n"
196                "  -D --directory=PATH       Root directory for the container\n"
197                "     --template=PATH        Initialize root directory from template directory,\n"
198                "                            if missing\n"
199                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
200                "                            remove it after exit\n"
201                "  -i --image=PATH           File system device or disk image for the container\n"
202                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
203                "  -u --user=USER            Run the command under specified user or uid\n"
204                "  -M --machine=NAME         Set the machine name for the container\n"
205                "     --uuid=UUID            Set a specific machine UUID for the container\n"
206                "  -S --slice=SLICE          Place the container in the specified slice\n"
207                "     --private-network      Disable network in container\n"
208                "     --network-interface=INTERFACE\n"
209                "                            Assign an existing network interface to the\n"
210                "                            container\n"
211                "     --network-macvlan=INTERFACE\n"
212                "                            Create a macvlan network interface based on an\n"
213                "                            existing network interface to the container\n"
214                "  -n --network-veth         Add a virtual ethernet connection between host\n"
215                "                            and container\n"
216                "     --network-bridge=INTERFACE\n"
217                "                            Add a virtual ethernet connection between host\n"
218                "                            and container and add it to an existing bridge on\n"
219                "                            the host\n"
220                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
221                "                            Expose a container IP port on the host\n"
222                "  -Z --selinux-context=SECLABEL\n"
223                "                            Set the SELinux security context to be used by\n"
224                "                            processes in the container\n"
225                "  -L --selinux-apifs-context=SECLABEL\n"
226                "                            Set the SELinux security context to be used by\n"
227                "                            API/tmpfs file systems in the container\n"
228                "     --capability=CAP       In addition to the default, retain specified\n"
229                "                            capability\n"
230                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
231                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
232                "                            try-guest, try-host\n"
233                "  -j                        Equivalent to --link-journal=try-guest\n"
234                "     --read-only            Mount the root directory read-only\n"
235                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
236                "                            the container\n"
237                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
238                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
239                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
240                "     --share-system         Share system namespaces with host\n"
241                "     --register=BOOLEAN     Register container as machine\n"
242                "     --keep-unit            Do not register a scope for the machine, reuse\n"
243                "                            the service unit nspawn is running in\n"
244                "     --volatile[=MODE]      Run the system in volatile mode\n"
245                , program_invocation_short_name);
246 }
247
/* Replaces *b with a cleaned-up, absolute form of path.
 *
 * The path is canonicalized if it can be; if that fails only because it does not exist
 * (ENOENT) it is merely made absolute relative to the current working directory. The old
 * string in *b is freed only once the new one is available.
 *
 * Returns 0 on success, -errno if canonicalization fails for another reason, and -ENOMEM if
 * the absolute path could not be allocated. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is reported to the caller as is */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Path does not exist (yet): fall back to a purely lexical absolute path */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
268
269 static int parse_argv(int argc, char *argv[]) {
270
271         enum {
272                 ARG_VERSION = 0x100,
273                 ARG_PRIVATE_NETWORK,
274                 ARG_UUID,
275                 ARG_READ_ONLY,
276                 ARG_CAPABILITY,
277                 ARG_DROP_CAPABILITY,
278                 ARG_LINK_JOURNAL,
279                 ARG_BIND,
280                 ARG_BIND_RO,
281                 ARG_TMPFS,
282                 ARG_SETENV,
283                 ARG_SHARE_SYSTEM,
284                 ARG_REGISTER,
285                 ARG_KEEP_UNIT,
286                 ARG_NETWORK_INTERFACE,
287                 ARG_NETWORK_MACVLAN,
288                 ARG_NETWORK_BRIDGE,
289                 ARG_PERSONALITY,
290                 ARG_VOLATILE,
291                 ARG_TEMPLATE,
292         };
293
294         static const struct option options[] = {
295                 { "help",                  no_argument,       NULL, 'h'                   },
296                 { "version",               no_argument,       NULL, ARG_VERSION           },
297                 { "directory",             required_argument, NULL, 'D'                   },
298                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
299                 { "ephemeral",             no_argument,       NULL, 'x'                   },
300                 { "user",                  required_argument, NULL, 'u'                   },
301                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
302                 { "boot",                  no_argument,       NULL, 'b'                   },
303                 { "uuid",                  required_argument, NULL, ARG_UUID              },
304                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
305                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
306                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
307                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
308                 { "bind",                  required_argument, NULL, ARG_BIND              },
309                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
310                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
311                 { "machine",               required_argument, NULL, 'M'                   },
312                 { "slice",                 required_argument, NULL, 'S'                   },
313                 { "setenv",                required_argument, NULL, ARG_SETENV            },
314                 { "selinux-context",       required_argument, NULL, 'Z'                   },
315                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
316                 { "quiet",                 no_argument,       NULL, 'q'                   },
317                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
318                 { "register",              required_argument, NULL, ARG_REGISTER          },
319                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
320                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
321                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
322                 { "network-veth",          no_argument,       NULL, 'n'                   },
323                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
324                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
325                 { "image",                 required_argument, NULL, 'i'                   },
326                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
327                 { "port",                  required_argument, NULL, 'p'                   },
328                 {}
329         };
330
331         int c, r;
332         uint64_t plus = 0, minus = 0;
333
334         assert(argc >= 0);
335         assert(argv);
336
337         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
338
339                 switch (c) {
340
341                 case 'h':
342                         help();
343                         return 0;
344
345                 case ARG_VERSION:
346                         puts(PACKAGE_STRING);
347                         puts(SYSTEMD_FEATURES);
348                         return 0;
349
350                 case 'D':
351                         r = set_sanitized_path(&arg_directory, optarg);
352                         if (r < 0)
353                                 return log_error_errno(r, "Invalid root directory: %m");
354
355                         break;
356
357                 case ARG_TEMPLATE:
358                         r = set_sanitized_path(&arg_template, optarg);
359                         if (r < 0)
360                                 return log_error_errno(r, "Invalid template directory: %m");
361
362                         break;
363
364                 case 'i':
365                         r = set_sanitized_path(&arg_image, optarg);
366                         if (r < 0)
367                                 return log_error_errno(r, "Invalid image path: %m");
368
369                         break;
370
371                 case 'x':
372                         arg_ephemeral = true;
373                         break;
374
375                 case 'u':
376                         free(arg_user);
377                         arg_user = strdup(optarg);
378                         if (!arg_user)
379                                 return log_oom();
380
381                         break;
382
383                 case ARG_NETWORK_BRIDGE:
384                         arg_network_bridge = optarg;
385
386                         /* fall through */
387
388                 case 'n':
389                         arg_network_veth = true;
390                         arg_private_network = true;
391                         break;
392
393                 case ARG_NETWORK_INTERFACE:
394                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
395                                 return log_oom();
396
397                         arg_private_network = true;
398                         break;
399
400                 case ARG_NETWORK_MACVLAN:
401                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
402                                 return log_oom();
403
404                         /* fall through */
405
406                 case ARG_PRIVATE_NETWORK:
407                         arg_private_network = true;
408                         break;
409
410                 case 'b':
411                         arg_boot = true;
412                         break;
413
414                 case ARG_UUID:
415                         r = sd_id128_from_string(optarg, &arg_uuid);
416                         if (r < 0) {
417                                 log_error("Invalid UUID: %s", optarg);
418                                 return r;
419                         }
420                         break;
421
422                 case 'S':
423                         arg_slice = optarg;
424                         break;
425
426                 case 'M':
427                         if (isempty(optarg)) {
428                                 free(arg_machine);
429                                 arg_machine = NULL;
430                         } else {
431                                 if (!machine_name_is_valid(optarg)) {
432                                         log_error("Invalid machine name: %s", optarg);
433                                         return -EINVAL;
434                                 }
435
436                                 r = free_and_strdup(&arg_machine, optarg);
437                                 if (r < 0)
438                                         return log_oom();
439
440                                 break;
441                         }
442
443                 case 'Z':
444                         arg_selinux_context = optarg;
445                         break;
446
447                 case 'L':
448                         arg_selinux_apifs_context = optarg;
449                         break;
450
451                 case ARG_READ_ONLY:
452                         arg_read_only = true;
453                         break;
454
455                 case ARG_CAPABILITY:
456                 case ARG_DROP_CAPABILITY: {
457                         const char *state, *word;
458                         size_t length;
459
460                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
461                                 _cleanup_free_ char *t;
462
463                                 t = strndup(word, length);
464                                 if (!t)
465                                         return log_oom();
466
467                                 if (streq(t, "all")) {
468                                         if (c == ARG_CAPABILITY)
469                                                 plus = (uint64_t) -1;
470                                         else
471                                                 minus = (uint64_t) -1;
472                                 } else {
473                                         int cap;
474
475                                         cap = capability_from_name(t);
476                                         if (cap < 0) {
477                                                 log_error("Failed to parse capability %s.", t);
478                                                 return -EINVAL;
479                                         }
480
481                                         if (c == ARG_CAPABILITY)
482                                                 plus |= 1ULL << (uint64_t) cap;
483                                         else
484                                                 minus |= 1ULL << (uint64_t) cap;
485                                 }
486                         }
487
488                         break;
489                 }
490
491                 case 'j':
492                         arg_link_journal = LINK_GUEST;
493                         arg_link_journal_try = true;
494                         break;
495
496                 case ARG_LINK_JOURNAL:
497                         if (streq(optarg, "auto")) {
498                                 arg_link_journal = LINK_AUTO;
499                                 arg_link_journal_try = false;
500                         } else if (streq(optarg, "no")) {
501                                 arg_link_journal = LINK_NO;
502                                 arg_link_journal_try = false;
503                         } else if (streq(optarg, "guest")) {
504                                 arg_link_journal = LINK_GUEST;
505                                 arg_link_journal_try = false;
506                         } else if (streq(optarg, "host")) {
507                                 arg_link_journal = LINK_HOST;
508                                 arg_link_journal_try = false;
509                         } else if (streq(optarg, "try-guest")) {
510                                 arg_link_journal = LINK_GUEST;
511                                 arg_link_journal_try = true;
512                         } else if (streq(optarg, "try-host")) {
513                                 arg_link_journal = LINK_HOST;
514                                 arg_link_journal_try = true;
515                         } else {
516                                 log_error("Failed to parse link journal mode %s", optarg);
517                                 return -EINVAL;
518                         }
519
520                         break;
521
522                 case ARG_BIND:
523                 case ARG_BIND_RO: {
524                         _cleanup_free_ char *a = NULL, *b = NULL;
525                         char *e;
526                         char ***x;
527
528                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
529
530                         e = strchr(optarg, ':');
531                         if (e) {
532                                 a = strndup(optarg, e - optarg);
533                                 b = strdup(e + 1);
534                         } else {
535                                 a = strdup(optarg);
536                                 b = strdup(optarg);
537                         }
538
539                         if (!a || !b)
540                                 return log_oom();
541
542                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
543                                 log_error("Invalid bind mount specification: %s", optarg);
544                                 return -EINVAL;
545                         }
546
547                         r = strv_extend(x, a);
548                         if (r < 0)
549                                 return log_oom();
550
551                         r = strv_extend(x, b);
552                         if (r < 0)
553                                 return log_oom();
554
555                         break;
556                 }
557
558                 case ARG_TMPFS: {
559                         _cleanup_free_ char *a = NULL, *b = NULL;
560                         char *e;
561
562                         e = strchr(optarg, ':');
563                         if (e) {
564                                 a = strndup(optarg, e - optarg);
565                                 b = strdup(e + 1);
566                         } else {
567                                 a = strdup(optarg);
568                                 b = strdup("mode=0755");
569                         }
570
571                         if (!a || !b)
572                                 return log_oom();
573
574                         if (!path_is_absolute(a)) {
575                                 log_error("Invalid tmpfs specification: %s", optarg);
576                                 return -EINVAL;
577                         }
578
579                         r = strv_push(&arg_tmpfs, a);
580                         if (r < 0)
581                                 return log_oom();
582
583                         a = NULL;
584
585                         r = strv_push(&arg_tmpfs, b);
586                         if (r < 0)
587                                 return log_oom();
588
589                         b = NULL;
590
591                         break;
592                 }
593
594                 case ARG_SETENV: {
595                         char **n;
596
597                         if (!env_assignment_is_valid(optarg)) {
598                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
599                                 return -EINVAL;
600                         }
601
602                         n = strv_env_set(arg_setenv, optarg);
603                         if (!n)
604                                 return log_oom();
605
606                         strv_free(arg_setenv);
607                         arg_setenv = n;
608                         break;
609                 }
610
611                 case 'q':
612                         arg_quiet = true;
613                         break;
614
615                 case ARG_SHARE_SYSTEM:
616                         arg_share_system = true;
617                         break;
618
619                 case ARG_REGISTER:
620                         r = parse_boolean(optarg);
621                         if (r < 0) {
622                                 log_error("Failed to parse --register= argument: %s", optarg);
623                                 return r;
624                         }
625
626                         arg_register = r;
627                         break;
628
629                 case ARG_KEEP_UNIT:
630                         arg_keep_unit = true;
631                         break;
632
633                 case ARG_PERSONALITY:
634
635                         arg_personality = personality_from_string(optarg);
636                         if (arg_personality == 0xffffffffLU) {
637                                 log_error("Unknown or unsupported personality '%s'.", optarg);
638                                 return -EINVAL;
639                         }
640
641                         break;
642
643                 case ARG_VOLATILE:
644
645                         if (!optarg)
646                                 arg_volatile = VOLATILE_YES;
647                         else {
648                                 r = parse_boolean(optarg);
649                                 if (r < 0) {
650                                         if (streq(optarg, "state"))
651                                                 arg_volatile = VOLATILE_STATE;
652                                         else {
653                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
654                                                 return r;
655                                         }
656                                 } else
657                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
658                         }
659
660                         break;
661
662                 case 'p': {
663                         const char *split, *e;
664                         uint16_t container_port, host_port;
665                         int protocol;
666                         ExposePort *p;
667
668                         if ((e = startswith(optarg, "tcp:")))
669                                 protocol = IPPROTO_TCP;
670                         else if ((e = startswith(optarg, "udp:")))
671                                 protocol = IPPROTO_UDP;
672                         else {
673                                 e = optarg;
674                                 protocol = IPPROTO_TCP;
675                         }
676
677                         split = strchr(e, ':');
678                         if (split) {
679                                 char v[split - e + 1];
680
681                                 memcpy(v, e, split - e);
682                                 v[split - e] = 0;
683
684                                 r = safe_atou16(v, &host_port);
685                                 if (r < 0 || host_port <= 0) {
686                                         log_error("Failed to parse host port: %s", optarg);
687                                         return -EINVAL;
688                                 }
689
690                                 r = safe_atou16(split + 1, &container_port);
691                         } else {
692                                 r = safe_atou16(e, &container_port);
693                                 host_port = container_port;
694                         }
695
696                         if (r < 0 || container_port <= 0) {
697                                 log_error("Failed to parse host port: %s", optarg);
698                                 return -EINVAL;
699                         }
700
701                         LIST_FOREACH(ports, p, arg_expose_ports) {
702                                 if (p->protocol == protocol && p->host_port == host_port) {
703                                         log_error("Duplicate port specification: %s", optarg);
704                                         return -EINVAL;
705                                 }
706                         }
707
708                         p = new(ExposePort, 1);
709                         if (!p)
710                                 return log_oom();
711
712                         p->protocol = protocol;
713                         p->host_port = host_port;
714                         p->container_port = container_port;
715
716                         LIST_PREPEND(ports, arg_expose_ports, p);
717
718                         break;
719                 }
720
721                 case '?':
722                         return -EINVAL;
723
724                 default:
725                         assert_not_reached("Unhandled option");
726                 }
727
728         if (arg_share_system)
729                 arg_register = false;
730
731         if (arg_boot && arg_share_system) {
732                 log_error("--boot and --share-system may not be combined.");
733                 return -EINVAL;
734         }
735
736         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
737                 log_error("--keep-unit may not be used when invoked from a user session.");
738                 return -EINVAL;
739         }
740
741         if (arg_directory && arg_image) {
742                 log_error("--directory= and --image= may not be combined.");
743                 return -EINVAL;
744         }
745
746         if (arg_template && arg_image) {
747                 log_error("--template= and --image= may not be combined.");
748                 return -EINVAL;
749         }
750
751         if (arg_template && !(arg_directory || arg_machine)) {
752                 log_error("--template= needs --directory= or --machine=.");
753                 return -EINVAL;
754         }
755
756         if (arg_ephemeral && arg_template) {
757                 log_error("--ephemeral and --template= may not be combined.");
758                 return -EINVAL;
759         }
760
761         if (arg_ephemeral && arg_image) {
762                 log_error("--ephemeral and --image= may not be combined.");
763                 return -EINVAL;
764         }
765
766         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
767                 log_error("--ephemeral and --link-journal= may not be combined.");
768                 return -EINVAL;
769         }
770
771         if (arg_volatile != VOLATILE_NO && arg_read_only) {
772                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
773                 return -EINVAL;
774         }
775
776         if (arg_expose_ports && !arg_private_network) {
777                 log_error("Cannot use --port= without private networking.");
778                 return -EINVAL;
779         }
780
781         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
782
783         return 1;
784 }
785
786 static int mount_all(const char *dest) {
787
788         typedef struct MountPoint {
789                 const char *what;
790                 const char *where;
791                 const char *type;
792                 const char *options;
793                 unsigned long flags;
794                 bool fatal;
795         } MountPoint;
796
797         static const MountPoint mount_table[] = {
798                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
799                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
800                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
801                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
802                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
803                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
804                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
805                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
806 #ifdef HAVE_SELINUX
807                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
808                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
809 #endif
810         };
811
812         unsigned k;
813         int r = 0;
814
815         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
816                 _cleanup_free_ char *where = NULL;
817 #ifdef HAVE_SELINUX
818                 _cleanup_free_ char *options = NULL;
819 #endif
820                 const char *o;
821                 int t;
822
823                 where = strjoin(dest, "/", mount_table[k].where, NULL);
824                 if (!where)
825                         return log_oom();
826
827                 t = path_is_mount_point(where, true);
828                 if (t < 0) {
829                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
830
831                         if (r == 0)
832                                 r = t;
833
834                         continue;
835                 }
836
837                 /* Skip this entry if it is not a remount. */
838                 if (mount_table[k].what && t > 0)
839                         continue;
840
841                 t = mkdir_p(where, 0755);
842                 if (t < 0) {
843                         if (mount_table[k].fatal) {
844                                log_error_errno(t, "Failed to create directory %s: %m", where);
845
846                                 if (r == 0)
847                                         r = t;
848                         } else
849                                log_warning_errno(t, "Failed to create directory %s: %m", where);
850
851                         continue;
852                 }
853
854 #ifdef HAVE_SELINUX
855                 if (arg_selinux_apifs_context &&
856                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
857                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
858                         if (!options)
859                                 return log_oom();
860
861                         o = options;
862                 } else
863 #endif
864                         o = mount_table[k].options;
865
866
867                 if (mount(mount_table[k].what,
868                           where,
869                           mount_table[k].type,
870                           mount_table[k].flags,
871                           o) < 0) {
872
873                         if (mount_table[k].fatal) {
874                                 log_error_errno(errno, "mount(%s) failed: %m", where);
875
876                                 if (r == 0)
877                                         r = -errno;
878                         } else
879                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
880                 }
881         }
882
883         return r;
884 }
885
886 static int mount_binds(const char *dest, char **l, bool ro) {
887         char **x, **y;
888
889         STRV_FOREACH_PAIR(x, y, l) {
890                 _cleanup_free_ char *where = NULL;
891                 struct stat source_st, dest_st;
892                 int r;
893
894                 if (stat(*x, &source_st) < 0)
895                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
896
897                 where = strappend(dest, *y);
898                 if (!where)
899                         return log_oom();
900
901                 r = stat(where, &dest_st);
902                 if (r == 0) {
903                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
904                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
905                                 return -EINVAL;
906                         }
907                 } else if (errno == ENOENT) {
908                         r = mkdir_parents_label(where, 0755);
909                         if (r < 0)
910                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
911                 } else {
912                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
913                         return -errno;
914                 }
915
916                 /* Create the mount point, but be conservative -- refuse to create block
917                  * and char devices. */
918                 if (S_ISDIR(source_st.st_mode)) {
919                         r = mkdir_label(where, 0755);
920                         if (r < 0 && errno != EEXIST)
921                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
922                 } else if (S_ISFIFO(source_st.st_mode)) {
923                         r = mkfifo(where, 0644);
924                         if (r < 0 && errno != EEXIST)
925                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
926                 } else if (S_ISSOCK(source_st.st_mode)) {
927                         r = mknod(where, 0644 | S_IFSOCK, 0);
928                         if (r < 0 && errno != EEXIST)
929                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
930                 } else if (S_ISREG(source_st.st_mode)) {
931                         r = touch(where);
932                         if (r < 0)
933                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
934                 } else {
935                         log_error("Refusing to create mountpoint for file: %s", *x);
936                         return -ENOTSUP;
937                 }
938
939                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
940                         return log_error_errno(errno, "mount(%s) failed: %m", where);
941
942                 if (ro) {
943                         r = bind_remount_recursive(where, true);
944                         if (r < 0)
945                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
946                 }
947         }
948
949         return 0;
950 }
951
/* Mounts the cgroup hierarchy selected by the mount option string
 * "controller" at <dest>/sys/fs/cgroup/<hierarchy>, read-only if
 * "read_only" is set. Returns 1 if a mount was made, 0 if the target
 * was a mount point already, and a negative errno-style code on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        unsigned long flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *target;
        int q;

        target = strappenda(dest, "/sys/fs/cgroup/", hierarchy);

        q = path_is_mount_point(target, false);
        if (q > 0)
                return 0;
        if (q < 0)
                return log_error_errno(q, "Failed to determine if %s is mounted already: %m", target);

        /* The result is not checked here; the outcome of mount() below
         * is what gets reported. */
        mkdir_p(target, 0755);

        if (read_only)
                flags |= MS_RDONLY;

        if (mount("cgroup", target, "cgroup", flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", target);

        return 1;
}
971
972 static int mount_cgroup(const char *dest) {
973         _cleanup_set_free_free_ Set *controllers = NULL;
974         _cleanup_free_ char *own_cgroup_path = NULL;
975         const char *cgroup_root, *systemd_root, *systemd_own;
976         int r;
977
978         controllers = set_new(&string_hash_ops);
979         if (!controllers)
980                 return log_oom();
981
982         r = cg_kernel_controllers(controllers);
983         if (r < 0)
984                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
985
986         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
987         if (r < 0)
988                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
989
990         cgroup_root = strappenda(dest, "/sys/fs/cgroup");
991         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
992                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
993
994         for (;;) {
995                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
996
997                 controller = set_steal_first(controllers);
998                 if (!controller)
999                         break;
1000
1001                 origin = strappend("/sys/fs/cgroup/", controller);
1002                 if (!origin)
1003                         return log_oom();
1004
1005                 r = readlink_malloc(origin, &combined);
1006                 if (r == -EINVAL) {
1007                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1008
1009                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1010                         if (r < 0)
1011                                 return r;
1012
1013                 } else if (r < 0)
1014                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1015                 else {
1016                         _cleanup_free_ char *target = NULL;
1017
1018                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1019                         if (!target)
1020                                 return log_oom();
1021
1022                         /* A symbolic link, a combination of controllers in one hierarchy */
1023
1024                         if (!filename_is_valid(combined)) {
1025                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1026                                 continue;
1027                         }
1028
1029                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1030                         if (r < 0)
1031                                 return r;
1032
1033                         if (symlink(combined, target) < 0)
1034                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1035                 }
1036         }
1037
1038         r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1039         if (r < 0)
1040                 return r;
1041
1042         /* Make our own cgroup a (writable) bind mount */
1043         systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1044         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1045                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1046
1047         /* And then remount the systemd cgroup root read-only */
1048         systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1049         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1050                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1051
1052         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1053                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1054
1055         return 0;
1056 }
1057
1058 static int mount_tmpfs(const char *dest) {
1059         char **i, **o;
1060
1061         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1062                 _cleanup_free_ char *where = NULL;
1063                 int r;
1064
1065                 where = strappend(dest, *i);
1066                 if (!where)
1067                         return log_oom();
1068
1069                 r = mkdir_label(where, 0755);
1070                 if (r < 0 && r != -EEXIST)
1071                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1072
1073                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1074                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1075         }
1076
1077         return 0;
1078 }
1079
1080 static int setup_timezone(const char *dest) {
1081         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1082         char *z, *y;
1083         int r;
1084
1085         assert(dest);
1086
1087         /* Fix the timezone, if possible */
1088         r = readlink_malloc("/etc/localtime", &p);
1089         if (r < 0) {
1090                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1091                 return 0;
1092         }
1093
1094         z = path_startswith(p, "../usr/share/zoneinfo/");
1095         if (!z)
1096                 z = path_startswith(p, "/usr/share/zoneinfo/");
1097         if (!z) {
1098                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1099                 return 0;
1100         }
1101
1102         where = strappend(dest, "/etc/localtime");
1103         if (!where)
1104                 return log_oom();
1105
1106         r = readlink_malloc(where, &q);
1107         if (r >= 0) {
1108                 y = path_startswith(q, "../usr/share/zoneinfo/");
1109                 if (!y)
1110                         y = path_startswith(q, "/usr/share/zoneinfo/");
1111
1112                 /* Already pointing to the right place? Then do nothing .. */
1113                 if (y && streq(y, z))
1114                         return 0;
1115         }
1116
1117         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1118         if (!check)
1119                 return log_oom();
1120
1121         if (access(check, F_OK) < 0) {
1122                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1123                 return 0;
1124         }
1125
1126         what = strappend("../usr/share/zoneinfo/", z);
1127         if (!what)
1128                 return log_oom();
1129
1130         r = mkdir_parents(where, 0755);
1131         if (r < 0) {
1132                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1133
1134                 return 0;
1135         }
1136
1137         r = unlink(where);
1138         if (r < 0 && errno != ENOENT) {
1139                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1140
1141                 return 0;
1142         }
1143
1144         if (symlink(what, where) < 0) {
1145                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1146                 return 0;
1147         }
1148
1149         return 0;
1150 }
1151
1152 static int setup_resolv_conf(const char *dest) {
1153         _cleanup_free_ char *where = NULL;
1154         int r;
1155
1156         assert(dest);
1157
1158         if (arg_private_network)
1159                 return 0;
1160
1161         /* Fix resolv.conf, if possible */
1162         where = strappend(dest, "/etc/resolv.conf");
1163         if (!where)
1164                 return log_oom();
1165
1166         /* We don't really care for the results of this really. If it
1167          * fails, it fails, but meh... */
1168         r = mkdir_parents(where, 0755);
1169         if (r < 0) {
1170                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1171
1172                 return 0;
1173         }
1174
1175         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1176         if (r < 0) {
1177                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1178
1179                 return 0;
1180         }
1181
1182         return 0;
1183 }
1184
1185 static int setup_volatile_state(const char *directory) {
1186         const char *p;
1187         int r;
1188
1189         assert(directory);
1190
1191         if (arg_volatile != VOLATILE_STATE)
1192                 return 0;
1193
1194         /* --volatile=state means we simply overmount /var
1195            with a tmpfs, and the rest read-only. */
1196
1197         r = bind_remount_recursive(directory, true);
1198         if (r < 0)
1199                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1200
1201         p = strappenda(directory, "/var");
1202         r = mkdir(p, 0755);
1203         if (r < 0 && errno != EEXIST)
1204                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1205
1206         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1207                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1208
1209         return 0;
1210 }
1211
1212 static int setup_volatile(const char *directory) {
1213         bool tmpfs_mounted = false, bind_mounted = false;
1214         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1215         const char *f, *t;
1216         int r;
1217
1218         assert(directory);
1219
1220         if (arg_volatile != VOLATILE_YES)
1221                 return 0;
1222
1223         /* --volatile=yes means we mount a tmpfs to the root dir, and
1224            the original /usr to use inside it, and that read-only. */
1225
1226         if (!mkdtemp(template))
1227                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1228
1229         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1230                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1231                 r = -errno;
1232                 goto fail;
1233         }
1234
1235         tmpfs_mounted = true;
1236
1237         f = strappenda(directory, "/usr");
1238         t = strappenda(template, "/usr");
1239
1240         r = mkdir(t, 0755);
1241         if (r < 0 && errno != EEXIST) {
1242                 log_error_errno(errno, "Failed to create %s: %m", t);
1243                 r = -errno;
1244                 goto fail;
1245         }
1246
1247         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1248                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1249                 r = -errno;
1250                 goto fail;
1251         }
1252
1253         bind_mounted = true;
1254
1255         r = bind_remount_recursive(t, true);
1256         if (r < 0) {
1257                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1258                 goto fail;
1259         }
1260
1261         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1262                 log_error_errno(errno, "Failed to move root mount: %m");
1263                 r = -errno;
1264                 goto fail;
1265         }
1266
1267         rmdir(template);
1268
1269         return 0;
1270
1271 fail:
1272         if (bind_mounted)
1273                 umount(t);
1274         if (tmpfs_mounted)
1275                 umount(template);
1276         rmdir(template);
1277         return r;
1278 }
1279
1280 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1281
1282         snprintf(s, 37,
1283                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1284                  SD_ID128_FORMAT_VAL(id));
1285
1286         return s;
1287 }
1288
1289 static int setup_boot_id(const char *dest) {
1290         _cleanup_free_ char *from = NULL, *to = NULL;
1291         sd_id128_t rnd = {};
1292         char as_uuid[37];
1293         int r;
1294
1295         assert(dest);
1296
1297         if (arg_share_system)
1298                 return 0;
1299
1300         /* Generate a new randomized boot ID, so that each boot-up of
1301          * the container gets a new one */
1302
1303         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1304         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1305         if (!from || !to)
1306                 return log_oom();
1307
1308         r = sd_id128_randomize(&rnd);
1309         if (r < 0)
1310                 return log_error_errno(r, "Failed to generate random boot id: %m");
1311
1312         id128_format_as_uuid(rnd, as_uuid);
1313
1314         r = write_string_file(from, as_uuid);
1315         if (r < 0)
1316                 return log_error_errno(r, "Failed to write boot id: %m");
1317
1318         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1319                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1320                 r = -errno;
1321         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1322                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1323
1324         unlink(from);
1325         return r;
1326 }
1327
1328 static int copy_devnodes(const char *dest) {
1329
1330         static const char devnodes[] =
1331                 "null\0"
1332                 "zero\0"
1333                 "full\0"
1334                 "random\0"
1335                 "urandom\0"
1336                 "tty\0"
1337                 "net/tun\0";
1338
1339         const char *d;
1340         int r = 0;
1341         _cleanup_umask_ mode_t u;
1342
1343         assert(dest);
1344
1345         u = umask(0000);
1346
1347         NULSTR_FOREACH(d, devnodes) {
1348                 _cleanup_free_ char *from = NULL, *to = NULL;
1349                 struct stat st;
1350
1351                 from = strappend("/dev/", d);
1352                 to = strjoin(dest, "/dev/", d, NULL);
1353                 if (!from || !to)
1354                         return log_oom();
1355
1356                 if (stat(from, &st) < 0) {
1357
1358                         if (errno != ENOENT)
1359                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1360
1361                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1362
1363                         log_error("%s is not a char or block device, cannot copy", from);
1364                         return -EIO;
1365
1366                 } else {
1367                         r = mkdir_parents(to, 0775);
1368                         if (r < 0) {
1369                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1370                                 return -r;
1371                         }
1372
1373                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1374                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1375                 }
1376         }
1377
1378         return r;
1379 }
1380
1381 static int setup_ptmx(const char *dest) {
1382         _cleanup_free_ char *p = NULL;
1383
1384         p = strappend(dest, "/dev/ptmx");
1385         if (!p)
1386                 return log_oom();
1387
1388         if (symlink("pts/ptmx", p) < 0)
1389                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1390
1391         return 0;
1392 }
1393
1394 static int setup_dev_console(const char *dest, const char *console) {
1395         _cleanup_umask_ mode_t u;
1396         const char *to;
1397         struct stat st;
1398         int r;
1399
1400         assert(dest);
1401         assert(console);
1402
1403         u = umask(0000);
1404
1405         if (stat("/dev/null", &st) < 0)
1406                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1407
1408         r = chmod_and_chown(console, 0600, 0, 0);
1409         if (r < 0)
1410                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1411
1412         /* We need to bind mount the right tty to /dev/console since
1413          * ptys can only exist on pts file systems. To have something
1414          * to bind mount things on we create a device node first, and
1415          * use /dev/null for that since we the cgroups device policy
1416          * allows us to create that freely, while we cannot create
1417          * /dev/console. (Note that the major minor doesn't actually
1418          * matter here, since we mount it over anyway). */
1419
1420         to = strappenda(dest, "/dev/console");
1421         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1422                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1423
1424         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1425                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1426
1427         return 0;
1428 }
1429
1430 static int setup_kmsg(const char *dest, int kmsg_socket) {
1431         _cleanup_free_ char *from = NULL, *to = NULL;
1432         _cleanup_umask_ mode_t u;
1433         int r, fd, k;
1434         union {
1435                 struct cmsghdr cmsghdr;
1436                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1437         } control = {};
1438         struct msghdr mh = {
1439                 .msg_control = &control,
1440                 .msg_controllen = sizeof(control),
1441         };
1442         struct cmsghdr *cmsg;
1443
1444         assert(dest);
1445         assert(kmsg_socket >= 0);
1446
1447         u = umask(0000);
1448
1449         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1450          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1451          * on the reading side behave very similar to /proc/kmsg,
1452          * their writing side behaves differently from /dev/kmsg in
1453          * that writing blocks when nothing is reading. In order to
1454          * avoid any problems with containers deadlocking due to this
1455          * we simply make /dev/kmsg unavailable to the container. */
1456         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1457             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1458                 return log_oom();
1459
1460         if (mkfifo(from, 0600) < 0)
1461                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1462
1463         r = chmod_and_chown(from, 0600, 0, 0);
1464         if (r < 0)
1465                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1466
1467         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1468                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1469
1470         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1471         if (fd < 0)
1472                 return log_error_errno(errno, "Failed to open fifo: %m");
1473
1474         cmsg = CMSG_FIRSTHDR(&mh);
1475         cmsg->cmsg_level = SOL_SOCKET;
1476         cmsg->cmsg_type = SCM_RIGHTS;
1477         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1478         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1479
1480         mh.msg_controllen = cmsg->cmsg_len;
1481
1482         /* Store away the fd in the socket, so that it stays open as
1483          * long as we run the child */
1484         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1485         safe_close(fd);
1486
1487         if (k < 0)
1488                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1489
1490         /* And now make the FIFO unavailable as /dev/kmsg... */
1491         unlink(from);
1492         return 0;
1493 }
1494
1495 static int send_rtnl(int send_fd) {
1496         union {
1497                 struct cmsghdr cmsghdr;
1498                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1499         } control = {};
1500         struct msghdr mh = {
1501                 .msg_control = &control,
1502                 .msg_controllen = sizeof(control),
1503         };
1504         struct cmsghdr *cmsg;
1505         _cleanup_close_ int fd = -1;
1506         ssize_t k;
1507
1508         assert(send_fd >= 0);
1509
1510         if (!arg_expose_ports)
1511                 return 0;
1512
1513         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1514         if (fd < 0)
1515                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1516
1517         cmsg = CMSG_FIRSTHDR(&mh);
1518         cmsg->cmsg_level = SOL_SOCKET;
1519         cmsg->cmsg_type = SCM_RIGHTS;
1520         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1521         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1522
1523         mh.msg_controllen = cmsg->cmsg_len;
1524
1525         /* Store away the fd in the socket, so that it stays open as
1526          * long as we run the child */
1527         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1528         if (k < 0)
1529                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1530
1531         return 0;
1532 }
1533
1534 static int flush_ports(union in_addr_union *exposed) {
1535         ExposePort *p;
1536         int r, af = AF_INET;
1537
1538         assert(exposed);
1539
1540         if (!arg_expose_ports)
1541                 return 0;
1542
1543         if (in_addr_is_null(af, exposed))
1544                 return 0;
1545
1546         log_debug("Lost IP address.");
1547
1548         LIST_FOREACH(ports, p, arg_expose_ports) {
1549                 r = fw_add_local_dnat(false,
1550                                       af,
1551                                       p->protocol,
1552                                       NULL,
1553                                       NULL, 0,
1554                                       NULL, 0,
1555                                       p->host_port,
1556                                       exposed,
1557                                       p->container_port,
1558                                       NULL);
1559                 if (r < 0)
1560                         log_warning_errno(r, "Failed to modify firewall: %m");
1561         }
1562
1563         *exposed = IN_ADDR_NULL;
1564         return 0;
1565 }
1566
1567 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1568         _cleanup_free_ struct local_address *addresses = NULL;
1569         _cleanup_free_ char *pretty = NULL;
1570         union in_addr_union new_exposed;
1571         ExposePort *p;
1572         bool add;
1573         int af = AF_INET, r;
1574
1575         assert(exposed);
1576
1577         /* Invoked each time an address is added or removed inside the
1578          * container */
1579
1580         if (!arg_expose_ports)
1581                 return 0;
1582
1583         r = local_addresses(rtnl, 0, af, &addresses);
1584         if (r < 0)
1585                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1586
1587         add = r > 0 &&
1588                 addresses[0].family == af &&
1589                 addresses[0].scope < RT_SCOPE_LINK;
1590
1591         if (!add)
1592                 return flush_ports(exposed);
1593
1594         new_exposed = addresses[0].address;
1595         if (in_addr_equal(af, exposed, &new_exposed))
1596                 return 0;
1597
1598         in_addr_to_string(af, &new_exposed, &pretty);
1599         log_debug("New container IP is %s.", strna(pretty));
1600
1601         LIST_FOREACH(ports, p, arg_expose_ports) {
1602
1603                 r = fw_add_local_dnat(true,
1604                                       af,
1605                                       p->protocol,
1606                                       NULL,
1607                                       NULL, 0,
1608                                       NULL, 0,
1609                                       p->host_port,
1610                                       &new_exposed,
1611                                       p->container_port,
1612                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1613                 if (r < 0)
1614                         log_warning_errno(r, "Failed to modify firewall: %m");
1615         }
1616
1617         *exposed = new_exposed;
1618         return 0;
1619 }
1620
1621 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1622         union in_addr_union *exposed = userdata;
1623
1624         assert(rtnl);
1625         assert(m);
1626         assert(exposed);
1627
1628         expose_ports(rtnl, exposed);
1629         return 0;
1630 }
1631
1632 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1633         union {
1634                 struct cmsghdr cmsghdr;
1635                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1636         } control = {};
1637         struct msghdr mh = {
1638                 .msg_control = &control,
1639                 .msg_controllen = sizeof(control),
1640         };
1641         struct cmsghdr *cmsg;
1642         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1643         int fd, r;
1644         ssize_t k;
1645
1646         assert(event);
1647         assert(recv_fd >= 0);
1648         assert(ret);
1649
1650         if (!arg_expose_ports)
1651                 return 0;
1652
1653         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1654         if (k < 0)
1655                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1656
1657         cmsg = CMSG_FIRSTHDR(&mh);
1658         assert(cmsg->cmsg_level == SOL_SOCKET);
1659         assert(cmsg->cmsg_type == SCM_RIGHTS);
1660         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1661         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1662
1663         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1664         if (r < 0) {
1665                 safe_close(fd);
1666                 return log_error_errno(r, "Failed to create rtnl object: %m");
1667         }
1668
1669         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1670         if (r < 0)
1671                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1672
1673         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1674         if (r < 0)
1675                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1676
1677         r = sd_rtnl_attach_event(rtnl, event, 0);
1678         if (r < 0)
1679                 return log_error_errno(r, "Failed to add to even loop: %m");
1680
1681         *ret = rtnl;
1682         rtnl = NULL;
1683
1684         return 0;
1685 }
1686
1687 static int setup_hostname(void) {
1688
1689         if (arg_share_system)
1690                 return 0;
1691
1692         if (sethostname_idempotent(arg_machine) < 0)
1693                 return -errno;
1694
1695         return 0;
1696 }
1697
1698 static int setup_journal(const char *directory) {
1699         sd_id128_t machine_id, this_id;
1700         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1701         char *id;
1702         int r;
1703
1704         /* Don't link journals in ephemeral mode */
1705         if (arg_ephemeral)
1706                 return 0;
1707
1708         p = strappend(directory, "/etc/machine-id");
1709         if (!p)
1710                 return log_oom();
1711
1712         r = read_one_line_file(p, &b);
1713         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1714                 return 0;
1715         else if (r < 0)
1716                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1717
1718         id = strstrip(b);
1719         if (isempty(id) && arg_link_journal == LINK_AUTO)
1720                 return 0;
1721
1722         /* Verify validity */
1723         r = sd_id128_from_string(id, &machine_id);
1724         if (r < 0)
1725                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1726
1727         r = sd_id128_get_machine(&this_id);
1728         if (r < 0)
1729                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1730
1731         if (sd_id128_equal(machine_id, this_id)) {
1732                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1733                          "Host and machine ids are equal (%s): refusing to link journals", id);
1734                 if (arg_link_journal == LINK_AUTO)
1735                         return 0;
1736                 return -EEXIST;
1737         }
1738
1739         if (arg_link_journal == LINK_NO)
1740                 return 0;
1741
1742         free(p);
1743         p = strappend("/var/log/journal/", id);
1744         q = strjoin(directory, "/var/log/journal/", id, NULL);
1745         if (!p || !q)
1746                 return log_oom();
1747
1748         if (path_is_mount_point(p, false) > 0) {
1749                 if (arg_link_journal != LINK_AUTO) {
1750                         log_error("%s: already a mount point, refusing to use for journal", p);
1751                         return -EEXIST;
1752                 }
1753
1754                 return 0;
1755         }
1756
1757         if (path_is_mount_point(q, false) > 0) {
1758                 if (arg_link_journal != LINK_AUTO) {
1759                         log_error("%s: already a mount point, refusing to use for journal", q);
1760                         return -EEXIST;
1761                 }
1762
1763                 return 0;
1764         }
1765
1766         r = readlink_and_make_absolute(p, &d);
1767         if (r >= 0) {
1768                 if ((arg_link_journal == LINK_GUEST ||
1769                      arg_link_journal == LINK_AUTO) &&
1770                     path_equal(d, q)) {
1771
1772                         r = mkdir_p(q, 0755);
1773                         if (r < 0)
1774                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1775                         return 0;
1776                 }
1777
1778                 if (unlink(p) < 0)
1779                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1780         } else if (r == -EINVAL) {
1781
1782                 if (arg_link_journal == LINK_GUEST &&
1783                     rmdir(p) < 0) {
1784
1785                         if (errno == ENOTDIR) {
1786                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1787                                 return r;
1788                         } else {
1789                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1790                                 return -errno;
1791                         }
1792                 }
1793         } else if (r != -ENOENT) {
1794                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1795                 return r;
1796         }
1797
1798         if (arg_link_journal == LINK_GUEST) {
1799
1800                 if (symlink(q, p) < 0) {
1801                         if (arg_link_journal_try) {
1802                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1803                                 return 0;
1804                         } else {
1805                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1806                                 return -errno;
1807                         }
1808                 }
1809
1810                 r = mkdir_p(q, 0755);
1811                 if (r < 0)
1812                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1813                 return 0;
1814         }
1815
1816         if (arg_link_journal == LINK_HOST) {
1817                 /* don't create parents here -- if the host doesn't have
1818                  * permanent journal set up, don't force it here */
1819                 r = mkdir(p, 0755);
1820                 if (r < 0) {
1821                         if (arg_link_journal_try) {
1822                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1823                                 return 0;
1824                         } else {
1825                                 log_error_errno(errno, "Failed to create %s: %m", p);
1826                                 return r;
1827                         }
1828                 }
1829
1830         } else if (access(p, F_OK) < 0)
1831                 return 0;
1832
1833         if (dir_is_empty(q) == 0)
1834                 log_warning("%s is not empty, proceeding anyway.", q);
1835
1836         r = mkdir_p(q, 0755);
1837         if (r < 0) {
1838                 log_error_errno(errno, "Failed to create %s: %m", q);
1839                 return r;
1840         }
1841
1842         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1843                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1844
1845         return 0;
1846 }
1847
/* Passes the bitwise complement of arg_retain to
 * capability_bounding_set_drop(), i.e. asks it to drop every capability that
 * is not listed in arg_retain. Returns that call's result unchanged. */
1848 static int drop_capabilities(void) {
1849         return capability_bounding_set_drop(~arg_retain, false);
1850 }
1851
/* Registers the container with org.freedesktop.machine1 on the system bus.
 *
 * pid           - PID sent as the machine's leader process
 * local_ifindex - host-side network interface index; only sent along (as a
 *                 one-element array) if it is > 0
 *
 * With arg_keep_unit set, RegisterMachineWithNetwork is called. Otherwise
 * CreateMachineWithNetwork is called, with an additional array of unit
 * properties (slice, device policy and device whitelist).
 *
 * Does nothing and returns 0 if arg_register is unset. Returns 0 on success,
 * a negative errno-style value on failure. */
1852 static int register_machine(pid_t pid, int local_ifindex) {
1853         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1854         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1855         int r;
1856
1857         if (!arg_register)
1858                 return 0;
1859
1860         r = sd_bus_default_system(&bus);
1861         if (r < 0)
1862                 return log_error_errno(r, "Failed to open system bus: %m");
1863
1864         if (arg_keep_unit) {
1865                 r = sd_bus_call_method(
1866                                 bus,
1867                                 "org.freedesktop.machine1",
1868                                 "/org/freedesktop/machine1",
1869                                 "org.freedesktop.machine1.Manager",
1870                                 "RegisterMachineWithNetwork",
1871                                 &error,
1872                                 NULL,
1873                                 "sayssusai",
1874                                 arg_machine,
1875                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1876                                 "nspawn",
1877                                 "container",
1878                                 (uint32_t) pid,
1879                                 strempty(arg_directory),
1880                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1881         } else {
1882                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1883
1884                 r = sd_bus_message_new_method_call(
1885                                 bus,
1886                                 &m,
1887                                 "org.freedesktop.machine1",
1888                                 "/org/freedesktop/machine1",
1889                                 "org.freedesktop.machine1.Manager",
1890                                 "CreateMachineWithNetwork");
1891                 if (r < 0)
1892                         return log_error_errno(r, "Failed to create message: %m");
1893
                 /* Same leading arguments as in the arg_keep_unit case
                  * above; the property array follows below. */
1894                 r = sd_bus_message_append(
1895                                 m,
1896                                 "sayssusai",
1897                                 arg_machine,
1898                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1899                                 "nspawn",
1900                                 "container",
1901                                 (uint32_t) pid,
1902                                 strempty(arg_directory),
1903                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1904                 if (r < 0)
1905                         return log_error_errno(r, "Failed to append message arguments: %m");
1906
                 /* Array of (name, value) property pairs; it stays open
                  * until the matching close_container call below. */
1907                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1908                 if (r < 0)
1909                         return log_error_errno(r, "Failed to open container: %m");
1910
1911                 if (!isempty(arg_slice)) {
1912                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1913                         if (r < 0)
1914                                 return log_error_errno(r, "Failed to append slice: %m");
1915                 }
1916
1917                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1918                 if (r < 0)
1919                         return log_error_errno(r, "Failed to add device policy: %m");
1920
                 /* The count (9) must match the number of (path,
                  * permissions) pairs that follow. */
1921                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1922                                           /* Allow the container to
1923                                            * access and create the API
1924                                            * device nodes, so that
1925                                            * PrivateDevices= in the
1926                                            * container can work
1927                                            * fine */
1928                                           "/dev/null", "rwm",
1929                                           "/dev/zero", "rwm",
1930                                           "/dev/full", "rwm",
1931                                           "/dev/random", "rwm",
1932                                           "/dev/urandom", "rwm",
1933                                           "/dev/tty", "rwm",
1934                                           "/dev/net/tun", "rwm",
1935                                           /* Allow the container
1936                                            * access to ptys. However,
1937                                            * do not permit the
1938                                            * container to ever create
1939                                            * these device nodes. */
1940                                           "/dev/pts/ptmx", "rw",
1941                                           "char-pts", "rw");
1942                 if (r < 0)
1943                         return log_error_errno(r, "Failed to add device whitelist: %m");
1944
1945                 r = sd_bus_message_close_container(m);
1946                 if (r < 0)
1947                         return log_error_errno(r, "Failed to close container: %m");
1948
1949                 r = sd_bus_call(bus, m, 0, &error, NULL);
1950         }
1951
         /* Shared error handling for the method call issued in either
          * branch above. */
1952         if (r < 0) {
1953                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1954                 return r;
1955         }
1956
1957         return 0;
1958 }
1959
1960 static int terminate_machine(pid_t pid) {
1961         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1962         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1963         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1964         const char *path;
1965         int r;
1966
1967         if (!arg_register)
1968                 return 0;
1969
1970         r = sd_bus_default_system(&bus);
1971         if (r < 0)
1972                 return log_error_errno(r, "Failed to open system bus: %m");
1973
1974         r = sd_bus_call_method(
1975                         bus,
1976                         "org.freedesktop.machine1",
1977                         "/org/freedesktop/machine1",
1978                         "org.freedesktop.machine1.Manager",
1979                         "GetMachineByPID",
1980                         &error,
1981                         &reply,
1982                         "u",
1983                         (uint32_t) pid);
1984         if (r < 0) {
1985                 /* Note that the machine might already have been
1986                  * cleaned up automatically, hence don't consider it a
1987                  * failure if we cannot get the machine object. */
1988                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1989                 return 0;
1990         }
1991
1992         r = sd_bus_message_read(reply, "o", &path);
1993         if (r < 0)
1994                 return bus_log_parse_error(r);
1995
1996         r = sd_bus_call_method(
1997                         bus,
1998                         "org.freedesktop.machine1",
1999                         path,
2000                         "org.freedesktop.machine1.Machine",
2001                         "Terminate",
2002                         &error,
2003                         NULL,
2004                         NULL);
2005         if (r < 0) {
2006                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2007                 return 0;
2008         }
2009
2010         return 0;
2011 }
2012
2013 static int reset_audit_loginuid(void) {
2014         _cleanup_free_ char *p = NULL;
2015         int r;
2016
2017         if (arg_share_system)
2018                 return 0;
2019
2020         r = read_one_line_file("/proc/self/loginuid", &p);
2021         if (r == -ENOENT)
2022                 return 0;
2023         if (r < 0)
2024                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2025
2026         /* Already reset? */
2027         if (streq(p, "4294967295"))
2028                 return 0;
2029
2030         r = write_string_file("/proc/self/loginuid", "4294967295");
2031         if (r < 0) {
2032                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2033                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2034                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2035                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2036                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2037
2038                 sleep(5);
2039         }
2040
2041         return 0;
2042 }
2043
/* Fixed 128-bit keys handed to generate_mac() as hash_key. A separate key is
 * used for the host side of the veth link, the container side of the veth
 * link, and MACVLAN interfaces. */
2044 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2045 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2046 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2047
2048 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2049         uint8_t result[8];
2050         size_t l, sz;
2051         uint8_t *v, *i;
2052         int r;
2053
2054         l = strlen(arg_machine);
2055         sz = sizeof(sd_id128_t) + l;
2056         if (idx > 0)
2057                 sz += sizeof(idx);
2058
2059         v = alloca(sz);
2060
2061         /* fetch some persistent data unique to the host */
2062         r = sd_id128_get_machine((sd_id128_t*) v);
2063         if (r < 0)
2064                 return r;
2065
2066         /* combine with some data unique (on this host) to this
2067          * container instance */
2068         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2069         if (idx > 0) {
2070                 idx = htole64(idx);
2071                 memcpy(i, &idx, sizeof(idx));
2072         }
2073
2074         /* Let's hash the host machine ID plus the container name. We
2075          * use a fixed, but originally randomly created hash key here. */
2076         siphash24(result, v, sz, hash_key.bytes);
2077
2078         assert_cc(ETH_ALEN <= sizeof(result));
2079         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2080
2081         /* see eth_random_addr in the kernel */
2082         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2083         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2084
2085         return 0;
2086 }
2087
/* Creates a veth pair with one end on the host and the peer end, named
 * "host0", assigned to the network namespace of pid.
 *
 * pid        - process whose network namespace receives the peer end
 * iface_name - buffer that receives the host-side interface name
 *              ("vb-<machine>" in bridge mode, "ve-<machine>" otherwise,
 *              truncated by the snprintf() bound)
 * ifi        - receives the interface index of the host-side end
 *
 * Does nothing and returns 0 (leaving iface_name and *ifi untouched) unless
 * both arg_private_network and arg_network_veth are set. Returns 0 on
 * success, a negative errno-style value on failure. */
2088 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2089         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2090         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2091         struct ether_addr mac_host, mac_container;
2092         int r, i;
2093
2094         if (!arg_private_network)
2095                 return 0;
2096
2097         if (!arg_network_veth)
2098                 return 0;
2099
2100         /* Use two different interface name prefixes depending whether
2101          * we are in bridge mode or not. */
2102         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2103                  arg_network_bridge ? "vb" : "ve", arg_machine);
2104
         /* Both ends get their own MAC address from generate_mac(), each
          * with a different hash key. */
2105         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2106         if (r < 0)
2107                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2108
2109         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2110         if (r < 0)
2111                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2112
2113         r = sd_rtnl_open(&rtnl, 0);
2114         if (r < 0)
2115                 return log_error_errno(r, "Failed to connect to netlink: %m");
2116
2117         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2118         if (r < 0)
2119                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2120
         /* Top-level attributes: name and MAC of the host-side end */
2121         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2122         if (r < 0)
2123                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2124
2125         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2126         if (r < 0)
2127                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2128
         /* Three nested containers are opened here (IFLA_LINKINFO,
          * IFLA_INFO_DATA for "veth", VETH_INFO_PEER); the three
          * close_container calls further down close them again. */
2129         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2130         if (r < 0)
2131                 return log_error_errno(r, "Failed to open netlink container: %m");
2132
2133         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2134         if (r < 0)
2135                 return log_error_errno(r, "Failed to open netlink container: %m");
2136
2137         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2138         if (r < 0)
2139                 return log_error_errno(r, "Failed to open netlink container: %m");
2140
         /* Attributes of the peer end: name, MAC and target namespace */
2141         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2142         if (r < 0)
2143                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2144
2145         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2146         if (r < 0)
2147                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2148
2149         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2150         if (r < 0)
2151                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2152
2153         r = sd_rtnl_message_close_container(m);
2154         if (r < 0)
2155                 return log_error_errno(r, "Failed to close netlink container: %m");
2156
2157         r = sd_rtnl_message_close_container(m);
2158         if (r < 0)
2159                 return log_error_errno(r, "Failed to close netlink container: %m");
2160
2161         r = sd_rtnl_message_close_container(m);
2162         if (r < 0)
2163                 return log_error_errno(r, "Failed to close netlink container: %m");
2164
2165         r = sd_rtnl_call(rtnl, m, 0, NULL);
2166         if (r < 0)
2167                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2168
         /* Look up the index of the host-side interface just created */
2169         i = (int) if_nametoindex(iface_name);
2170         if (i <= 0)
2171                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2172
2173         *ifi = i;
2174
2175         return 0;
2176 }
2177
2178 static int setup_bridge(const char veth_name[], int *ifi) {
2179         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2180         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2181         int r, bridge;
2182
2183         if (!arg_private_network)
2184                 return 0;
2185
2186         if (!arg_network_veth)
2187                 return 0;
2188
2189         if (!arg_network_bridge)
2190                 return 0;
2191
2192         bridge = (int) if_nametoindex(arg_network_bridge);
2193         if (bridge <= 0)
2194                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2195
2196         *ifi = bridge;
2197
2198         r = sd_rtnl_open(&rtnl, 0);
2199         if (r < 0)
2200                 return log_error_errno(r, "Failed to connect to netlink: %m");
2201
2202         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2203         if (r < 0)
2204                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2205
2206         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2207         if (r < 0)
2208                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2209
2210         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2211         if (r < 0)
2212                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2213
2214         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2215         if (r < 0)
2216                 return log_error_errno(r, "Failed to add netlink master field: %m");
2217
2218         r = sd_rtnl_call(rtnl, m, 0, NULL);
2219         if (r < 0)
2220                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2221
2222         return 0;
2223 }
2224
2225 static int parse_interface(struct udev *udev, const char *name) {
2226         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2227         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2228         int ifi;
2229
2230         ifi = (int) if_nametoindex(name);
2231         if (ifi <= 0)
2232                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2233
2234         sprintf(ifi_str, "n%i", ifi);
2235         d = udev_device_new_from_device_id(udev, ifi_str);
2236         if (!d)
2237                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2238
2239         if (udev_device_get_is_initialized(d) <= 0) {
2240                 log_error("Network interface %s is not initialized yet.", name);
2241                 return -EBUSY;
2242         }
2243
2244         return ifi;
2245 }
2246
2247 static int move_network_interfaces(pid_t pid) {
2248         _cleanup_udev_unref_ struct udev *udev = NULL;
2249         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2250         char **i;
2251         int r;
2252
2253         if (!arg_private_network)
2254                 return 0;
2255
2256         if (strv_isempty(arg_network_interfaces))
2257                 return 0;
2258
2259         r = sd_rtnl_open(&rtnl, 0);
2260         if (r < 0)
2261                 return log_error_errno(r, "Failed to connect to netlink: %m");
2262
2263         udev = udev_new();
2264         if (!udev) {
2265                 log_error("Failed to connect to udev.");
2266                 return -ENOMEM;
2267         }
2268
2269         STRV_FOREACH(i, arg_network_interfaces) {
2270                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2271                 int ifi;
2272
2273                 ifi = parse_interface(udev, *i);
2274                 if (ifi < 0)
2275                         return ifi;
2276
2277                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2278                 if (r < 0)
2279                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2280
2281                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2282                 if (r < 0)
2283                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2284
2285                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2286                 if (r < 0)
2287                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2288         }
2289
2290         return 0;
2291 }
2292
2293 static int setup_macvlan(pid_t pid) {
2294         _cleanup_udev_unref_ struct udev *udev = NULL;
2295         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2296         unsigned idx = 0;
2297         char **i;
2298         int r;
2299
2300         if (!arg_private_network)
2301                 return 0;
2302
2303         if (strv_isempty(arg_network_macvlan))
2304                 return 0;
2305
2306         r = sd_rtnl_open(&rtnl, 0);
2307         if (r < 0)
2308                 return log_error_errno(r, "Failed to connect to netlink: %m");
2309
2310         udev = udev_new();
2311         if (!udev) {
2312                 log_error("Failed to connect to udev.");
2313                 return -ENOMEM;
2314         }
2315
2316         STRV_FOREACH(i, arg_network_macvlan) {
2317                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2318                 _cleanup_free_ char *n = NULL;
2319                 struct ether_addr mac;
2320                 int ifi;
2321
2322                 ifi = parse_interface(udev, *i);
2323                 if (ifi < 0)
2324                         return ifi;
2325
2326                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2327                 if (r < 0)
2328                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2329
2330                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2331                 if (r < 0)
2332                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2333
2334                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2335                 if (r < 0)
2336                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2337
2338                 n = strappend("mv-", *i);
2339                 if (!n)
2340                         return log_oom();
2341
2342                 strshorten(n, IFNAMSIZ-1);
2343
2344                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2345                 if (r < 0)
2346                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2347
2348                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2349                 if (r < 0)
2350                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2351
2352                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2353                 if (r < 0)
2354                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2355
2356                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2357                 if (r < 0)
2358                         return log_error_errno(r, "Failed to open netlink container: %m");
2359
2360                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2361                 if (r < 0)
2362                         return log_error_errno(r, "Failed to open netlink container: %m");
2363
2364                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2365                 if (r < 0)
2366                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2367
2368                 r = sd_rtnl_message_close_container(m);
2369                 if (r < 0)
2370                         return log_error_errno(r, "Failed to close netlink container: %m");
2371
2372                 r = sd_rtnl_message_close_container(m);
2373                 if (r < 0)
2374                         return log_error_errno(r, "Failed to close netlink container: %m");
2375
2376                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2377                 if (r < 0)
2378                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2379         }
2380
2381         return 0;
2382 }
2383
/* Installs the seccomp filter used for the container: everything is allowed
 * by default, a fixed list of system calls fails with EPERM, and creation
 * of NETLINK_AUDIT sockets fails with EAFNOSUPPORT.
 *
 * Returns 0 on success (always, when built without HAVE_SECCOMP) and a
 * negative errno-style value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto out;
        }

        for (k = 0; k < ELEMENTSOF(blocked_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[k], 0);

                /* -EFAULT means the syscall is unknown, which is not an error for us */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto out;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto out;
        }

        /* Do not let loading the filter turn on NO_NEW_PRIVS */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto out;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

out:
        /* The context is released on the success and the failure path alike */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2463
2464 static int setup_propagate(const char *root) {
2465         const char *p, *q;
2466
2467         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2468         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2469         p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2470         (void) mkdir_p(p, 0600);
2471
2472         q = strappenda(root, "/run/systemd/nspawn/incoming");
2473         mkdir_parents(q, 0755);
2474         mkdir_p(q, 0600);
2475
2476         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2477                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2478
2479         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2480                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2481
2482         return 0;
2483 }
2484
2485 static int setup_image(char **device_path, int *loop_nr) {
2486         struct loop_info64 info = {
2487                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2488         };
2489         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2490         _cleanup_free_ char* loopdev = NULL;
2491         struct stat st;
2492         int r, nr;
2493
2494         assert(device_path);
2495         assert(loop_nr);
2496         assert(arg_image);
2497
2498         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2499         if (fd < 0)
2500                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2501
2502         if (fstat(fd, &st) < 0)
2503                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2504
2505         if (S_ISBLK(st.st_mode)) {
2506                 char *p;
2507
2508                 p = strdup(arg_image);
2509                 if (!p)
2510                         return log_oom();
2511
2512                 *device_path = p;
2513
2514                 *loop_nr = -1;
2515
2516                 r = fd;
2517                 fd = -1;
2518
2519                 return r;
2520         }
2521
2522         if (!S_ISREG(st.st_mode)) {
2523                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2524                 return -EINVAL;
2525         }
2526
2527         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2528         if (control < 0)
2529                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2530
2531         nr = ioctl(control, LOOP_CTL_GET_FREE);
2532         if (nr < 0)
2533                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2534
2535         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2536                 return log_oom();
2537
2538         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2539         if (loop < 0)
2540                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2541
2542         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2543                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2544
2545         if (arg_read_only)
2546                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2547
2548         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2549                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2550
2551         *device_path = loopdev;
2552         loopdev = NULL;
2553
2554         *loop_nr = nr;
2555
2556         r = loop;
2557         loop = -1;
2558
2559         return r;
2560 }
2561
2562 static int wait_for_block_device(struct udev *udev, dev_t devnum, struct udev_device **ret) {
2563         _cleanup_udev_monitor_unref_ struct udev_monitor *monitor = NULL;
2564         int r;
2565
2566         assert(udev);
2567         assert(ret);
2568
2569         for (;;) {
2570                 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2571                 struct pollfd pfd = {
2572                         .events = POLLIN
2573                 };
2574
2575                 d = udev_device_new_from_devnum(udev, 'b', devnum);
2576                 if (!d)
2577                         return log_oom();
2578
2579                 r = udev_device_get_is_initialized(d);
2580                 if (r < 0)
2581                         return log_error_errno(r, "Failed to check if device is initialized: %m");
2582                 if (r > 0) {
2583                         *ret = d;
2584                         d = NULL;
2585                         return 0;
2586                 }
2587                 d = udev_device_unref(d);
2588
2589                 if (!monitor) {
2590                         monitor = udev_monitor_new_from_netlink(udev, "udev");
2591                         if (!monitor)
2592                                 return log_oom();
2593
2594                         r = udev_monitor_filter_add_match_subsystem_devtype(monitor, "block", NULL);
2595                         if (r < 0)
2596                                 return log_error_errno(r, "Failed to add block match: %m");
2597
2598                         r = udev_monitor_enable_receiving(monitor);
2599                         if (r < 0)
2600                                 return log_error_errno(r, "Failed to turn on monitor: %m");
2601
2602                         continue;
2603                 }
2604
2605                 pfd.fd = udev_monitor_get_fd(monitor);
2606                 if (pfd.fd < 0)
2607                         return log_error_errno(r, "Failed to get udev monitor fd: %m");
2608
2609                 r = poll(&pfd, 1, -1);
2610                 if (r < 0)
2611                         return log_error_errno(errno, "Failed to wait for device initialization: %m");
2612
2613                 d = udev_monitor_receive_device(monitor);
2614         }
2615
2616         return 0;
2617 }
2618
/* Explanation appended to the error messages about disk images whose
 * partition table we cannot make use of. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."

/* Inspects the partition table of the block device open as "fd" and picks
 * the partitions to mount for the container.
 *
 * GPT: partitions are selected by type UUID (root for the native and, where
 * defined, the secondary architecture, /home, /srv). Partitions flagged
 * GPT_FLAG_NO_AUTO are ignored, and among several partitions of one type
 * the one with the lowest partition number is used. The read-only flag of
 * each selected partition is reported via the *_rw parameters.
 *
 * MBR: exactly one partition of type 0x83 whose flags are 0x80 (bootable)
 * is accepted as root; more than one is an error.
 *
 * On success returns 0. *root_device is set to a newly allocated device
 * node path (owned by the caller), *secondary tells whether the secondary
 * architecture root was chosen, and *home_device / *srv_device are set
 * only if such partitions were found. Returns a negative errno-style value
 * on failure, -ENOTSUP if built without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif

        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;
        bool is_gpt, is_mbr;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                /* Return what the log call hands back, errno might be
                 * modified by the logging itself. */
                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0)
                return log_error_errno(errno != 0 ? errno : EIO, "Failed to probe: %m");

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                r = -errno;
                log_error("Failed to list partitions of %s", arg_image);
                return r;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        /* Don't look at the partition devices before udev is done with the whole disk */
        r = wait_for_block_device(udev, st.st_rdev, &d);
        if (r < 0)
                return r;

        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Initialized, so that the cleanup handler never sees an indeterminate pointer */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q)
                        return log_error_errno(errno != 0 ? errno : ENOMEM,
                                               "Failed to get partition device of %s: %m", arg_image);

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (is_gpt && (flags & GPT_FLAG_NO_AUTO))
                        continue;
                if (is_mbr && (flags != 0x80)) /* Bootable flag */
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif

                } else if (is_mbr) {
                        int type;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        /* Note that there's a certain, intended
                         * asymmetry here: while for GPT we simply
                         * take the first valid partition and ignore
                         * all others of the same type, for MBR we
                         * fail if there are multiple suitable
                         * partitions. This is because the GPT
                         * partition types are defined by us, and
                         * hence we can define their lookup semantics,
                         * while for the MBR logic we reuse existing
                         * definitions, and simply don't want to make
                         * out the situation. */

                        if (root) {
                                log_error("Identified multiple bootable Linux 0x83 partitions on\n"
                                          "    %s\n"
                                          PARTITION_TABLE_BLURB, arg_image);
                                return -EINVAL;
                        }

                        /* The partition number is not recorded here: a second
                         * candidate is refused above, so it would never be
                         * compared, and root_nr only exists if GPT_ROOT_NATIVE
                         * is defined. */

                        r = free_and_strdup(&root, node);
                        if (r < 0)
                                return log_oom();
                }
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        /* Hand the results over to the caller; the native root is preferred over the secondary one */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2920
/* Probes the file system type of the block device "what" with blkid and
 * mounts it with MS_NODEV on "where", or on "where" + "directory" if
 * "directory" is non-NULL. The mount is read-only unless "rw" is true and
 * arg_read_only is off. LUKS volumes are refused.
 *
 * Returns 0 on success and a negative errno-style value on failure,
 * -ENOTSUP if built without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                /* Return what the log call hands back, errno might be
                 * modified by the logging itself. */
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A return
                 * value of -1 is a real probing error and handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0)
                return log_error_errno(errno != 0 ? errno : EIO, "Failed to probe %s: %m", what);

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                r = errno != 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2984
/* Mounts the partitions picked for the container below "where": the root
 * device on "where" itself, the home device on "where"/home and the srv
 * device on "where"/srv. Devices passed as NULL are skipped; the *_rw flags
 * are forwarded to mount_device().
 *
 * Returns 0 on success and a negative errno-style value as soon as one of
 * the mounts fails. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount below the directory we were asked to use, instead of
         * silently going for the global arg_directory. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
3014
3015 static void loop_remove(int nr, int *image_fd) {
3016         _cleanup_close_ int control = -1;
3017         int r;
3018
3019         if (nr < 0)
3020                 return;
3021
3022         if (image_fd && *image_fd >= 0) {
3023                 r = ioctl(*image_fd, LOOP_CLR_FD);
3024                 if (r < 0)
3025                         log_debug_errno(errno, "Failed to close loop image: %m");
3026                 *image_fd = safe_close(*image_fd);
3027         }
3028
3029         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3030         if (control < 0) {
3031                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3032                 return;
3033         }
3034
3035         r = ioctl(control, LOOP_CTL_REMOVE, nr);
3036         if (r < 0)
3037                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3038 }
3039
/* Forks off "getent <database> <key>" with an empty environment, stdin and
 * stderr connected to /dev/null, and stdout connected to a pipe.
 *
 * On success returns the read end of that pipe and stores the PID of the
 * child in *rpid; the caller has to close the descriptor and reap the
 * child. On failure returns a negative errno-style value. */
static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
        int pipe_fds[2];
        pid_t pid;
        int r;

        assert(database);
        assert(key);
        assert(rpid);

        if (pipe2(pipe_fds, O_CLOEXEC) < 0)
                return log_error_errno(errno, "Failed to allocate pipe: %m");

        pid = fork();
        if (pid < 0) {
                r = log_error_errno(errno, "Failed to fork getent child: %m");

                /* Don't leak the pipe if there is no child to talk to */
                safe_close(pipe_fds[0]);
                safe_close(pipe_fds[1]);

                return r;
        } else if (pid == 0) {
                int nullfd;
                char *empty_env = NULL;

                /* Child: dup3() without O_CLOEXEC, so that stdout survives the exec */
                if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (pipe_fds[0] > 2)
                        safe_close(pipe_fds[0]);
                if (pipe_fds[1] > 2)
                        safe_close(pipe_fds[1]);

                nullfd = open("/dev/null", O_RDWR);
                if (nullfd < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDIN_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDERR_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (nullfd > 2)
                        safe_close(nullfd);

                reset_all_signal_handlers();
                close_all_fds(NULL, 0);

                /* The argument list terminator of a variadic call has to be
                 * a pointer, hence the explicit cast. The second location
                 * is only tried if the first exec fails. */
                execle("/usr/bin/getent", "getent", database, key, (char*) NULL, &empty_env);
                execle("/bin/getent", "getent", database, key, (char*) NULL, &empty_env);
                _exit(EXIT_FAILURE);
        }

        /* Parent: only the read end is needed */
        pipe_fds[1] = safe_close(pipe_fds[1]);

        *rpid = pid;

        return pipe_fds[0];
}
3093
3094 static int change_uid_gid(char **_home) {
3095         char line[LINE_MAX], *x, *u, *g, *h;
3096         const char *word, *state;
3097         _cleanup_free_ uid_t *uids = NULL;
3098         _cleanup_free_ char *home = NULL;
3099         _cleanup_fclose_ FILE *f = NULL;
3100         _cleanup_close_ int fd = -1;
3101         unsigned n_uids = 0;
3102         size_t sz = 0, l;
3103         uid_t uid;
3104         gid_t gid;
3105         pid_t pid;
3106         int r;
3107
3108         assert(_home);
3109
3110         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3111                 /* Reset everything fully to 0, just in case */
3112
3113                 if (setgroups(0, NULL) < 0)
3114                         return log_error_errno(errno, "setgroups() failed: %m");
3115
3116                 if (setresgid(0, 0, 0) < 0)
3117                         return log_error_errno(errno, "setregid() failed: %m");
3118
3119                 if (setresuid(0, 0, 0) < 0)
3120                         return log_error_errno(errno, "setreuid() failed: %m");
3121
3122                 *_home = NULL;
3123                 return 0;
3124         }
3125
3126         /* First, get user credentials */
3127         fd = spawn_getent("passwd", arg_user, &pid);
3128         if (fd < 0)
3129                 return fd;
3130
3131         f = fdopen(fd, "r");
3132         if (!f)
3133                 return log_oom();
3134         fd = -1;
3135
3136         if (!fgets(line, sizeof(line), f)) {
3137
3138                 if (!ferror(f)) {
3139                         log_error("Failed to resolve user %s.", arg_user);
3140                         return -ESRCH;
3141                 }
3142
3143                 log_error_errno(errno, "Failed to read from getent: %m");
3144                 return -errno;
3145         }
3146
3147         truncate_nl(line);
3148
3149         wait_for_terminate_and_warn("getent passwd", pid, true);
3150
3151         x = strchr(line, ':');
3152         if (!x) {
3153                 log_error("/etc/passwd entry has invalid user field.");
3154                 return -EIO;
3155         }
3156
3157         u = strchr(x+1, ':');
3158         if (!u) {
3159                 log_error("/etc/passwd entry has invalid password field.");
3160                 return -EIO;
3161         }
3162
3163         u++;
3164         g = strchr(u, ':');
3165         if (!g) {
3166                 log_error("/etc/passwd entry has invalid UID field.");
3167                 return -EIO;
3168         }
3169
3170         *g = 0;
3171         g++;
3172         x = strchr(g, ':');
3173         if (!x) {
3174                 log_error("/etc/passwd entry has invalid GID field.");
3175                 return -EIO;
3176         }
3177
3178         *x = 0;
3179         h = strchr(x+1, ':');
3180         if (!h) {
3181                 log_error("/etc/passwd entry has invalid GECOS field.");
3182                 return -EIO;
3183         }
3184
3185         h++;
3186         x = strchr(h, ':');
3187         if (!x) {
3188                 log_error("/etc/passwd entry has invalid home directory field.");
3189                 return -EIO;
3190         }
3191
3192         *x = 0;
3193
3194         r = parse_uid(u, &uid);
3195         if (r < 0) {
3196                 log_error("Failed to parse UID of user.");
3197                 return -EIO;
3198         }
3199
3200         r = parse_gid(g, &gid);
3201         if (r < 0) {
3202                 log_error("Failed to parse GID of user.");
3203                 return -EIO;
3204         }
3205
3206         home = strdup(h);
3207         if (!home)
3208                 return log_oom();
3209
3210         /* Second, get group memberships */
3211         fd = spawn_getent("initgroups", arg_user, &pid);
3212         if (fd < 0)
3213                 return fd;
3214
3215         fclose(f);
3216         f = fdopen(fd, "r");
3217         if (!f)
3218                 return log_oom();
3219         fd = -1;
3220
3221         if (!fgets(line, sizeof(line), f)) {
3222                 if (!ferror(f)) {
3223                         log_error("Failed to resolve user %s.", arg_user);
3224                         return -ESRCH;
3225                 }
3226
3227                 log_error_errno(errno, "Failed to read from getent: %m");
3228                 return -errno;
3229         }
3230
3231         truncate_nl(line);
3232
3233         wait_for_terminate_and_warn("getent initgroups", pid, true);
3234
3235         /* Skip over the username and subsequent separator whitespace */
3236         x = line;
3237         x += strcspn(x, WHITESPACE);
3238         x += strspn(x, WHITESPACE);
3239
3240         FOREACH_WORD(word, l, x, state) {
3241                 char c[l+1];
3242
3243                 memcpy(c, word, l);
3244                 c[l] = 0;
3245
3246                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3247                         return log_oom();
3248
3249                 r = parse_uid(c, &uids[n_uids++]);
3250                 if (r < 0) {
3251                         log_error("Failed to parse group data from getent.");
3252                         return -EIO;
3253                 }
3254         }
3255
3256         r = mkdir_parents(home, 0775);
3257         if (r < 0)
3258                 return log_error_errno(r, "Failed to make home root directory: %m");
3259
3260         r = mkdir_safe(home, 0755, uid, gid);
3261         if (r < 0 && r != -EEXIST)
3262                 return log_error_errno(r, "Failed to make home directory: %m");
3263
3264         fchown(STDIN_FILENO, uid, gid);
3265         fchown(STDOUT_FILENO, uid, gid);
3266         fchown(STDERR_FILENO, uid, gid);
3267
3268         if (setgroups(n_uids, uids) < 0)
3269                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3270
3271         if (setresgid(gid, gid, gid) < 0)
3272                 return log_error_errno(errno, "setregid() failed: %m");
3273
3274         if (setresuid(uid, uid, uid) < 0)
3275                 return log_error_errno(errno, "setreuid() failed: %m");
3276
3277         if (_home) {
3278                 *_home = home;
3279                 home = NULL;
3280         }
3281
3282         return 0;
3283 }
3284
3285 /*
3286  * Return values:
3287  * < 0 : wait_for_terminate() failed to get the state of the
3288  *       container, the container was terminated by a signal, or
3289  *       failed for an unknown reason.  No change is made to the
3290  *       container argument.
3291  * > 0 : The program executed in the container terminated with an
3292  *       error.  The exit code of the program executed in the
3293  *       container is returned.  The container argument has been set
3294  *       to CONTAINER_TERMINATED.
3295  *   0 : The container is being rebooted, has been shut down or exited
3296  *       successfully.  The container argument has been set to either
3297  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3298  *
3299  * That is, success is indicated by a return value of zero, and an
3300  * error is indicated by a non-zero value.
3301  */
3302 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3303         siginfo_t status;
3304         int r;
3305
3306         r = wait_for_terminate(pid, &status);
3307         if (r < 0)
3308                 return log_warning_errno(r, "Failed to wait for container: %m");
3309
3310         switch (status.si_code) {
3311
3312         case CLD_EXITED:
3313                 if (status.si_status == 0) {
3314                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3315
3316                 } else
3317                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3318
3319                 *container = CONTAINER_TERMINATED;
3320                 return status.si_status;
3321
3322         case CLD_KILLED:
3323                 if (status.si_status == SIGINT) {
3324
3325                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3326                         *container = CONTAINER_TERMINATED;
3327                         return 0;
3328
3329                 } else if (status.si_status == SIGHUP) {
3330
3331                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3332                         *container = CONTAINER_REBOOTED;
3333                         return 0;
3334                 }
3335
3336                 /* CLD_KILLED fallthrough */
3337
3338         case CLD_DUMPED:
3339                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3340                 return -EIO;
3341
3342         default:
3343                 log_error("Container %s failed due to unknown reason.", arg_machine);
3344                 return -EIO;
3345         }
3346
3347         return r;
3348 }
3349
/* Signal handler that intentionally does nothing. main() installs it
 * for SIGCHLD so that the signal is delivered (interrupting blocking
 * calls in the parent) rather than being ignored. */
static void nop_handler(int sig) {
        /* Nothing to do here. */
}
3351
3352 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3353         pid_t pid;
3354
3355         pid = PTR_TO_UINT32(userdata);
3356         if (pid > 0) {
3357                 if (kill(pid, SIGRTMIN+3) >= 0) {
3358                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3359                         sd_event_source_set_userdata(s, NULL);
3360                         return 0;
3361                 }
3362         }
3363
3364         sd_event_exit(sd_event_source_get_event(s), 0);
3365         return 0;
3366 }
3367
3368 static int determine_names(void) {
3369         int r;
3370
3371         if (!arg_image && !arg_directory) {
3372                 if (arg_machine) {
3373                         _cleanup_(image_unrefp) Image *i = NULL;
3374
3375                         r = image_find(arg_machine, &i);
3376                         if (r < 0)
3377                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3378                         else if (r == 0) {
3379                                 log_error("No image for machine '%s': %m", arg_machine);
3380                                 return -ENOENT;
3381                         }
3382
3383                         if (i->type == IMAGE_RAW)
3384                                 r = set_sanitized_path(&arg_image, i->path);
3385                         else
3386                                 r = set_sanitized_path(&arg_directory, i->path);
3387                         if (r < 0)
3388                                 return log_error_errno(r, "Invalid image directory: %m");
3389
3390                         arg_read_only = arg_read_only || i->read_only;
3391                 } else
3392                         arg_directory = get_current_dir_name();
3393
3394                 if (!arg_directory && !arg_machine) {
3395                         log_error("Failed to determine path, please use -D or -i.");
3396                         return -EINVAL;
3397                 }
3398         }
3399
3400         if (!arg_machine) {
3401                 if (arg_directory && path_equal(arg_directory, "/"))
3402                         arg_machine = gethostname_malloc();
3403                 else
3404                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3405
3406                 if (!arg_machine)
3407                         return log_oom();
3408
3409                 hostname_cleanup(arg_machine, false);
3410                 if (!machine_name_is_valid(arg_machine)) {
3411                         log_error("Failed to determine machine name automatically, please use -M.");
3412                         return -EINVAL;
3413                 }
3414
3415                 if (arg_ephemeral) {
3416                         char *b;
3417
3418                         /* Add a random suffix when this is an
3419                          * ephemeral machine, so that we can run many
3420                          * instances at once without manually having
3421                          * to specify -M each time. */
3422
3423                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3424                                 return log_oom();
3425
3426                         free(arg_machine);
3427                         arg_machine = b;
3428                 }
3429         }
3430
3431         return 0;
3432 }
3433
3434 int main(int argc, char *argv[]) {
3435
3436         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3437         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3438         _cleanup_close_ int master = -1, image_fd = -1;
3439         _cleanup_fdset_free_ FDSet *fds = NULL;
3440         int r, n_fd_passed, loop_nr = -1;
3441         char veth_name[IFNAMSIZ];
3442         bool secondary = false, remove_subvol = false;
3443         sigset_t mask, mask_chld;
3444         pid_t pid = 0;
3445         int ret = EXIT_SUCCESS;
3446         union in_addr_union exposed = {};
3447         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3448
3449         log_parse_environment();
3450         log_open();
3451
3452         r = parse_argv(argc, argv);
3453         if (r <= 0)
3454                 goto finish;
3455
3456         r = determine_names();
3457         if (r < 0)
3458                 goto finish;
3459
3460         if (geteuid() != 0) {
3461                 log_error("Need to be root.");
3462                 r = -EPERM;
3463                 goto finish;
3464         }
3465
3466         if (sd_booted() <= 0) {
3467                 log_error("Not running on a systemd system.");
3468                 r = -EINVAL;
3469                 goto finish;
3470         }
3471
3472         log_close();
3473         n_fd_passed = sd_listen_fds(false);
3474         if (n_fd_passed > 0) {
3475                 r = fdset_new_listen_fds(&fds, false);
3476                 if (r < 0) {
3477                         log_error_errno(r, "Failed to collect file descriptors: %m");
3478                         goto finish;
3479                 }
3480         }
3481         fdset_close_others(fds);
3482         log_open();
3483
3484         if (arg_directory) {
3485                 assert(!arg_image);
3486
3487                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3488                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3489                         r = -EINVAL;
3490                         goto finish;
3491                 }
3492
3493                 if (arg_ephemeral) {
3494                         _cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
3495                         char *np;
3496
3497                         /* If the specified path is a mount point we
3498                          * generate the new snapshot immediately
3499                          * inside it under a random name. However if
3500                          * the specified is not a mount point we
3501                          * create the new snapshot in the parent
3502                          * directory, just next to it. */
3503                         r = path_is_mount_point(arg_directory, false);
3504                         if (r < 0) {
3505                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3506                                 goto finish;
3507                         }
3508                         if (r > 0)
3509                                 r = tempfn_random_child(arg_directory, &np);
3510                         else
3511                                 r = tempfn_random(arg_directory, &np);
3512                         if (r < 0) {
3513                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3514                                 goto finish;
3515                         }
3516
3517                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3518                         if (r < 0) {
3519                                 log_error_errno(r, "Failed to lock %s: %m", np);
3520                                 goto finish;
3521                         }
3522
3523                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3524                         if (r < 0) {
3525                                 free(np);
3526                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3527                                 goto finish;
3528                         }
3529
3530                         free(arg_directory);
3531                         arg_directory = np;
3532
3533                         remove_subvol = true;
3534
3535                 } else {
3536                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3537                         if (r == -EBUSY) {
3538                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3539                                 goto finish;
3540                         }
3541                         if (r < 0) {
3542                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3543                                 return r;
3544                         }
3545
3546                         if (arg_template) {
3547                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3548                                 if (r == -EEXIST) {
3549                                         if (!arg_quiet)
3550                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3551                                 } else if (r < 0) {
3552                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3553                                         goto finish;
3554                                 } else {
3555                                         if (!arg_quiet)
3556                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3557                                 }
3558                         }
3559                 }
3560
3561                 if (arg_boot) {
3562                         if (path_is_os_tree(arg_directory) <= 0) {
3563                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3564                                 r = -EINVAL;
3565                                 goto finish;
3566                         }
3567                 } else {
3568                         const char *p;
3569
3570                         p = strappenda(arg_directory,
3571                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3572                         if (access(p, F_OK) < 0) {
3573                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3574                                 r = -EINVAL;
3575                                 goto finish;
3576                         }
3577                 }
3578
3579         } else {
3580                 char template[] = "/tmp/nspawn-root-XXXXXX";
3581
3582                 assert(arg_image);
3583                 assert(!arg_template);
3584
3585                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3586                 if (r == -EBUSY) {
3587                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3588                         goto finish;
3589                 }
3590                 if (r < 0) {
3591                         r = log_error_errno(r, "Failed to create image lock: %m");
3592                         goto finish;
3593                 }
3594
3595                 if (!mkdtemp(template)) {
3596                         log_error_errno(errno, "Failed to create temporary directory: %m");
3597                         r = -errno;
3598                         goto finish;
3599                 }
3600
3601                 arg_directory = strdup(template);
3602                 if (!arg_directory) {
3603                         r = log_oom();
3604                         goto finish;
3605                 }
3606
3607                 image_fd = setup_image(&device_path, &loop_nr);
3608                 if (image_fd < 0) {
3609                         r = image_fd;
3610                         goto finish;
3611                 }
3612
3613                 r = dissect_image(image_fd,
3614                                   &root_device, &root_device_rw,
3615                                   &home_device, &home_device_rw,
3616                                   &srv_device, &srv_device_rw,
3617                                   &secondary);
3618                 if (r < 0)
3619                         goto finish;
3620         }
3621
3622         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3623         if (master < 0) {
3624                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3625                 goto finish;
3626         }
3627
3628         r = ptsname_malloc(master, &console);
3629         if (r < 0) {
3630                 r = log_error_errno(r, "Failed to determine tty name: %m");
3631                 goto finish;
3632         }
3633
3634         if (!arg_quiet)
3635                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3636                          arg_machine, arg_image ?: arg_directory);
3637
3638         if (unlockpt(master) < 0) {
3639                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3640                 goto finish;
3641         }
3642
3643         assert_se(sigemptyset(&mask) == 0);
3644         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3645         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3646
3647         assert_se(sigemptyset(&mask_chld) == 0);
3648         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3649
3650         for (;;) {
3651                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3652                 ContainerStatus container_status;
3653                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3654                 struct sigaction sa = {
3655                         .sa_handler = nop_handler,
3656                         .sa_flags = SA_NOCLDSTOP,
3657                 };
3658
3659                 r = barrier_create(&barrier);
3660                 if (r < 0) {
3661                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3662                         goto finish;
3663                 }
3664
3665                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3666                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3667                         goto finish;
3668                 }
3669
3670                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3671                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3672                         goto finish;
3673                 }
3674
3675                 /* Child can be killed before execv(), so handle SIGCHLD
3676                  * in order to interrupt parent's blocking calls and
3677                  * give it a chance to call wait() and terminate. */
3678                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3679                 if (r < 0) {
3680                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3681                         goto finish;
3682                 }
3683
3684                 r = sigaction(SIGCHLD, &sa, NULL);
3685                 if (r < 0) {
3686                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3687                         goto finish;
3688                 }
3689
3690                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3691                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3692                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3693                 if (pid < 0) {
3694                         if (errno == EINVAL)
3695                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3696                         else
3697                                 r = log_error_errno(errno, "clone() failed: %m");
3698
3699                         goto finish;
3700                 }
3701
3702                 if (pid == 0) {
3703                         /* child */
3704                         _cleanup_free_ char *home = NULL;
3705                         unsigned n_env = 2;
3706                         const char *envp[] = {
3707                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3708                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3709                                 NULL, /* TERM */
3710                                 NULL, /* HOME */
3711                                 NULL, /* USER */
3712                                 NULL, /* LOGNAME */
3713                                 NULL, /* container_uuid */
3714                                 NULL, /* LISTEN_FDS */
3715                                 NULL, /* LISTEN_PID */
3716                                 NULL
3717                         };
3718                         char **env_use;
3719
3720                         barrier_set_role(&barrier, BARRIER_CHILD);
3721
3722                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3723                         if (envp[n_env])
3724                                 n_env ++;
3725
3726                         master = safe_close(master);
3727
3728                         close_nointr(STDIN_FILENO);
3729                         close_nointr(STDOUT_FILENO);
3730                         close_nointr(STDERR_FILENO);
3731
3732                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3733                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3734
3735                         reset_all_signal_handlers();
3736                         reset_signal_mask();
3737
3738                         r = open_terminal(console, O_RDWR);
3739                         if (r != STDIN_FILENO) {
3740                                 if (r >= 0) {
3741                                         safe_close(r);
3742                                         r = -EINVAL;
3743                                 }
3744
3745                                 log_error_errno(r, "Failed to open console: %m");
3746                                 _exit(EXIT_FAILURE);
3747                         }
3748
3749                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3750                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3751                                 log_error_errno(errno, "Failed to duplicate console: %m");
3752                                 _exit(EXIT_FAILURE);
3753                         }
3754
3755                         if (setsid() < 0) {
3756                                 log_error_errno(errno, "setsid() failed: %m");
3757                                 _exit(EXIT_FAILURE);
3758                         }
3759
3760                         if (reset_audit_loginuid() < 0)
3761                                 _exit(EXIT_FAILURE);
3762
3763                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3764                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3765                                 _exit(EXIT_FAILURE);
3766                         }
3767
3768                         /* Mark everything as slave, so that we still
3769                          * receive mounts from the real root, but don't
3770                          * propagate mounts to the real root. */
3771                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3772                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3773                                 _exit(EXIT_FAILURE);
3774                         }
3775
3776                         if (mount_devices(arg_directory,
3777                                           root_device, root_device_rw,
3778                                           home_device, home_device_rw,
3779                                           srv_device, srv_device_rw) < 0)
3780                                 _exit(EXIT_FAILURE);
3781
3782                         /* Turn directory into bind mount */
3783                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3784                                 log_error_errno(errno, "Failed to make bind mount: %m");
3785                                 _exit(EXIT_FAILURE);
3786                         }
3787
3788                         r = setup_volatile(arg_directory);
3789                         if (r < 0)
3790                                 _exit(EXIT_FAILURE);
3791
3792                         if (setup_volatile_state(arg_directory) < 0)
3793                                 _exit(EXIT_FAILURE);
3794
3795                         r = base_filesystem_create(arg_directory);
3796                         if (r < 0)
3797                                 _exit(EXIT_FAILURE);
3798
3799                         if (arg_read_only) {
3800                                 r = bind_remount_recursive(arg_directory, true);
3801                                 if (r < 0) {
3802                                         log_error_errno(r, "Failed to make tree read-only: %m");
3803                                         _exit(EXIT_FAILURE);
3804                                 }
3805                         }
3806
3807                         if (mount_all(arg_directory) < 0)
3808                                 _exit(EXIT_FAILURE);
3809
3810                         if (copy_devnodes(arg_directory) < 0)
3811                                 _exit(EXIT_FAILURE);
3812
3813                         if (setup_ptmx(arg_directory) < 0)
3814                                 _exit(EXIT_FAILURE);
3815
3816                         dev_setup(arg_directory);
3817
3818                         if (setup_propagate(arg_directory) < 0)
3819                                 _exit(EXIT_FAILURE);
3820
3821                         if (setup_seccomp() < 0)
3822                                 _exit(EXIT_FAILURE);
3823
3824                         if (setup_dev_console(arg_directory, console) < 0)
3825                                 _exit(EXIT_FAILURE);
3826
3827                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3828                                 _exit(EXIT_FAILURE);
3829                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3830
3831                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
3832                                 _exit(EXIT_FAILURE);
3833                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3834
3835                         /* Tell the parent that we are ready, and that
3836                          * it can cgroupify us to that we lack access
3837                          * to certain devices and resources. */
3838                         (void) barrier_place(&barrier);
3839
3840                         if (setup_boot_id(arg_directory) < 0)
3841                                 _exit(EXIT_FAILURE);
3842
3843                         if (setup_timezone(arg_directory) < 0)
3844                                 _exit(EXIT_FAILURE);
3845
3846                         if (setup_resolv_conf(arg_directory) < 0)
3847                                 _exit(EXIT_FAILURE);
3848
3849                         if (setup_journal(arg_directory) < 0)
3850                                 _exit(EXIT_FAILURE);
3851
3852                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3853                                 _exit(EXIT_FAILURE);
3854
3855                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3856                                 _exit(EXIT_FAILURE);
3857
3858                         if (mount_tmpfs(arg_directory) < 0)
3859                                 _exit(EXIT_FAILURE);
3860
3861                         /* Wait until we are cgroup-ified, so that we
3862                          * can mount the right cgroup path writable */
3863                         (void) barrier_sync_next(&barrier);
3864
3865                         if (mount_cgroup(arg_directory) < 0)
3866                                 _exit(EXIT_FAILURE);
3867
3868                         if (chdir(arg_directory) < 0) {
3869                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3870                                 _exit(EXIT_FAILURE);
3871                         }
3872
3873                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3874                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3875                                 _exit(EXIT_FAILURE);
3876                         }
3877
3878                         if (chroot(".") < 0) {
3879                                 log_error_errno(errno, "chroot() failed: %m");
3880                                 _exit(EXIT_FAILURE);
3881                         }
3882
3883                         if (chdir("/") < 0) {
3884                                 log_error_errno(errno, "chdir() failed: %m");
3885                                 _exit(EXIT_FAILURE);
3886                         }
3887
3888                         umask(0022);
3889
3890                         if (arg_private_network)
3891                                 loopback_setup();
3892
3893                         if (drop_capabilities() < 0) {
3894                                 log_error_errno(errno, "drop_capabilities() failed: %m");
3895                                 _exit(EXIT_FAILURE);
3896                         }
3897
3898                         r = change_uid_gid(&home);
3899                         if (r < 0)
3900                                 _exit(EXIT_FAILURE);
3901
3902                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3903                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3904                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3905                                 log_oom();
3906                                 _exit(EXIT_FAILURE);
3907                         }
3908
3909                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3910                                 char as_uuid[37];
3911
3912                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3913                                         log_oom();
3914                                         _exit(EXIT_FAILURE);
3915                                 }
3916                         }
3917
3918                         if (fdset_size(fds) > 0) {
3919                                 r = fdset_cloexec(fds, false);
3920                                 if (r < 0) {
3921                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3922                                         _exit(EXIT_FAILURE);
3923                                 }
3924
3925                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3926                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3927                                         log_oom();
3928                                         _exit(EXIT_FAILURE);
3929                                 }
3930                         }
3931
3932                         setup_hostname();
3933
3934                         if (arg_personality != 0xffffffffLU) {
3935                                 if (personality(arg_personality) < 0) {
3936                                         log_error_errno(errno, "personality() failed: %m");
3937                                         _exit(EXIT_FAILURE);
3938                                 }
3939                         } else if (secondary) {
3940                                 if (personality(PER_LINUX32) < 0) {
3941                                         log_error_errno(errno, "personality() failed: %m");
3942                                         _exit(EXIT_FAILURE);
3943                                 }
3944                         }
3945
3946 #ifdef HAVE_SELINUX
3947                         if (arg_selinux_context)
3948                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3949                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3950                                         _exit(EXIT_FAILURE);
3951                                 }
3952 #endif
3953
3954                         if (!strv_isempty(arg_setenv)) {
3955                                 char **n;
3956
3957                                 n = strv_env_merge(2, envp, arg_setenv);
3958                                 if (!n) {
3959                                         log_oom();
3960                                         _exit(EXIT_FAILURE);
3961                                 }
3962
3963                                 env_use = n;
3964                         } else
3965                                 env_use = (char**) envp;
3966
3967                         /* Wait until the parent is ready with the setup, too... */
3968                         if (!barrier_place_and_sync(&barrier))
3969                                 _exit(EXIT_FAILURE);
3970
3971                         if (arg_boot) {
3972                                 char **a;
3973                                 size_t l;
3974
3975                                 /* Automatically search for the init system */
3976
3977                                 l = 1 + argc - optind;
3978                                 a = newa(char*, l + 1);
3979                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3980
3981                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3982                                 execve(a[0], a, env_use);
3983
3984                                 a[0] = (char*) "/lib/systemd/systemd";
3985                                 execve(a[0], a, env_use);
3986
3987                                 a[0] = (char*) "/sbin/init";
3988                                 execve(a[0], a, env_use);
3989                         } else if (argc > optind)
3990                                 execvpe(argv[optind], argv + optind, env_use);
3991                         else {
3992                                 chdir(home ? home : "/root");
3993                                 execle("/bin/bash", "-bash", NULL, env_use);
3994                                 execle("/bin/sh", "-sh", NULL, env_use);
3995                         }
3996
3997                         log_error_errno(errno, "execv() failed: %m");
3998                         _exit(EXIT_FAILURE);
3999                 }
4000
4001                 barrier_set_role(&barrier, BARRIER_PARENT);
4002                 fdset_free(fds);
4003                 fds = NULL;
4004
4005                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4006                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4007
4008                 /* Wait for the most basic Child-setup to be done,
4009                  * before we add hardware to it, and place it in a
4010                  * cgroup. */
4011                 if (barrier_sync_next(&barrier)) {
4012                         int ifi = 0;
4013
4014                         r = move_network_interfaces(pid);
4015                         if (r < 0)
4016                                 goto finish;
4017
4018                         r = setup_veth(pid, veth_name, &ifi);
4019                         if (r < 0)
4020                                 goto finish;
4021
4022                         r = setup_bridge(veth_name, &ifi);
4023                         if (r < 0)
4024                                 goto finish;
4025
4026                         r = setup_macvlan(pid);
4027                         if (r < 0)
4028                                 goto finish;
4029
4030                         r = register_machine(pid, ifi);
4031                         if (r < 0)
4032                                 goto finish;
4033
4034                         /* Block SIGCHLD here, before notifying child.
4035                          * process_pty() will handle it with the other signals. */
4036                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4037                         if (r < 0)
4038                                 goto finish;
4039
4040                         /* Reset signal to default */
4041                         r = default_signals(SIGCHLD, -1);
4042                         if (r < 0)
4043                                 goto finish;
4044
4045                         /* Notify the child that the parent is ready with all
4046                          * its setup, and that the child can now hand over
4047                          * control to the code to run inside the container. */
4048                         (void) barrier_place(&barrier);
4049
4050                         /* And wait that the child is completely ready now. */
4051                         if (barrier_place_and_sync(&barrier)) {
4052                                 _cleanup_event_unref_ sd_event *event = NULL;
4053                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4054                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4055                                 char last_char = 0;
4056
4057                                 sd_notifyf(false,
4058                                            "READY=1\n"
4059                                            "STATUS=Container running.\n"
4060                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4061
4062                                 r = sd_event_new(&event);
4063                                 if (r < 0) {
4064                                         log_error_errno(r, "Failed to get default event source: %m");
4065                                         goto finish;
4066                                 }
4067
4068                                 if (arg_boot) {
4069                                         /* Try to kill the init system on SIGINT or SIGTERM */
4070                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4071                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4072                                 } else {
4073                                         /* Immediately exit */
4074                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4075                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4076                                 }
4077
4078                                 /* simply exit on sigchld */
4079                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4080
4081                                 if (arg_expose_ports) {
4082                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4083                                         if (r < 0)
4084                                                 goto finish;
4085
4086                                         (void) expose_ports(rtnl, &exposed);
4087                                 }
4088
4089                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4090
4091                                 r = pty_forward_new(event, master, true, &forward);
4092                                 if (r < 0) {
4093                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4094                                         goto finish;
4095                                 }
4096
4097                                 r = sd_event_loop(event);
4098                                 if (r < 0) {
4099                                         log_error_errno(r, "Failed to run event loop: %m");
4100                                         goto finish;
4101                                 }
4102
4103                                 pty_forward_get_last_char(forward, &last_char);
4104
4105                                 forward = pty_forward_free(forward);
4106
4107                                 if (!arg_quiet && last_char != '\n')
4108                                         putc('\n', stdout);
4109
4110                                 /* Kill if it is not dead yet anyway */
4111                                 terminate_machine(pid);
4112                         }
4113                 }
4114
4115                 /* Normally redundant, but better safe than sorry */
4116                 kill(pid, SIGKILL);
4117
4118                 r = wait_for_container(pid, &container_status);
4119                 pid = 0;
4120
4121                 if (r < 0)
4122                         /* We failed to wait for the container, or the
4123                          * container exited abnormally */
4124                         goto finish;
4125                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4126                         /* The container exited with a non-zero
4127                          * status, or with zero status and no reboot
4128                          * was requested. */
4129                         ret = r;
4130                         break;
4131                 }
4132
4133                 /* CONTAINER_REBOOTED, loop again */
4134
4135                 if (arg_keep_unit) {
4136                         /* Special handling if we are running as a
4137                          * service: instead of simply restarting the
4138                          * machine we want to restart the entire
4139                          * service, so let's inform systemd about this
4140                          * with the special exit code 133. The service
4141                          * file uses RestartForceExitStatus=133 so
4142                          * that this results in a full nspawn
4143                          * restart. This is necessary since we might
4144                          * have cgroup parameters set we want to have
4145                          * flushed out. */
4146                         ret = 133;
4147                         r = 0;
4148                         break;
4149                 }
4150
4151                 flush_ports(&exposed);
4152         }
4153
4154 finish:
4155         sd_notify(false,
4156                   "STOPPING=1\n"
4157                   "STATUS=Terminating...");
4158
4159         loop_remove(loop_nr, &image_fd);
4160
4161         if (pid > 0)
4162                 kill(pid, SIGKILL);
4163
4164         if (remove_subvol && arg_directory) {
4165                 int k;
4166
4167                 k = btrfs_subvol_remove(arg_directory);
4168                 if (k < 0)
4169                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4170         }
4171
4172         if (arg_machine) {
4173                 const char *p;
4174
4175                 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
4176                 (void) rm_rf(p, false, true, false);
4177         }
4178
4179         free(arg_directory);
4180         free(arg_template);
4181         free(arg_image);
4182         free(arg_machine);
4183         free(arg_user);
4184         strv_free(arg_setenv);
4185         strv_free(arg_network_interfaces);
4186         strv_free(arg_network_macvlan);
4187         strv_free(arg_bind);
4188         strv_free(arg_bind_ro);
4189         strv_free(arg_tmpfs);
4190
4191         flush_ports(&exposed);
4192
4193         while (arg_expose_ports) {
4194                 ExposePort *p = arg_expose_ports;
4195                 LIST_REMOVE(ports, arg_expose_ports, p);
4196                 free(p);
4197         }
4198
4199         return r < 0 ? EXIT_FAILURE : ret;
4200 }