chiark / gitweb /
nspawn: support dissecting GPT images that contain only a single generic linux partition
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
107 typedef struct ExposePort {
108         int protocol;
109         uint16_t host_port;
110         uint16_t container_port;
111         LIST_FIELDS(struct ExposePort, ports);
112 } ExposePort;
113
/* Outcome reported for a finished container run.
 * NOTE(review): the producers/consumers of these values are outside this
 * chunk; meanings are inferred from the enumerator names only. */
enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
};

typedef enum ContainerStatus ContainerStatus;
118
/* Journal linking mode selected via --link-journal= (or -j). */
enum LinkJournal {
        LINK_NO    = 0,   /* "no" */
        LINK_AUTO  = 1,   /* "auto" (the default) */
        LINK_HOST  = 2,   /* "host" / "try-host" */
        LINK_GUEST = 3    /* "guest" / "try-guest" / -j */
};

typedef enum LinkJournal LinkJournal;
125
/* Mode selected via --volatile[=MODE]. */
enum Volatile {
        VOLATILE_NO    = 0,   /* option absent, or an explicit false value */
        VOLATILE_YES   = 1,   /* bare --volatile, or an explicit true value */
        VOLATILE_STATE = 2    /* --volatile=state */
};

typedef enum Volatile Volatile;
131
132 static char *arg_directory = NULL;
133 static char *arg_template = NULL;
134 static char *arg_user = NULL;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;
137 static const char *arg_selinux_context = NULL;
138 static const char *arg_selinux_apifs_context = NULL;
139 static const char *arg_slice = NULL;
140 static bool arg_private_network = false;
141 static bool arg_read_only = false;
142 static bool arg_boot = false;
143 static bool arg_ephemeral = false;
144 static LinkJournal arg_link_journal = LINK_AUTO;
145 static bool arg_link_journal_try = false;
146 static uint64_t arg_retain =
147         (1ULL << CAP_CHOWN) |
148         (1ULL << CAP_DAC_OVERRIDE) |
149         (1ULL << CAP_DAC_READ_SEARCH) |
150         (1ULL << CAP_FOWNER) |
151         (1ULL << CAP_FSETID) |
152         (1ULL << CAP_IPC_OWNER) |
153         (1ULL << CAP_KILL) |
154         (1ULL << CAP_LEASE) |
155         (1ULL << CAP_LINUX_IMMUTABLE) |
156         (1ULL << CAP_NET_BIND_SERVICE) |
157         (1ULL << CAP_NET_BROADCAST) |
158         (1ULL << CAP_NET_RAW) |
159         (1ULL << CAP_SETGID) |
160         (1ULL << CAP_SETFCAP) |
161         (1ULL << CAP_SETPCAP) |
162         (1ULL << CAP_SETUID) |
163         (1ULL << CAP_SYS_ADMIN) |
164         (1ULL << CAP_SYS_CHROOT) |
165         (1ULL << CAP_SYS_NICE) |
166         (1ULL << CAP_SYS_PTRACE) |
167         (1ULL << CAP_SYS_TTY_CONFIG) |
168         (1ULL << CAP_SYS_RESOURCE) |
169         (1ULL << CAP_SYS_BOOT) |
170         (1ULL << CAP_AUDIT_WRITE) |
171         (1ULL << CAP_AUDIT_CONTROL) |
172         (1ULL << CAP_MKNOD);
173 static char **arg_bind = NULL;
174 static char **arg_bind_ro = NULL;
175 static char **arg_tmpfs = NULL;
176 static char **arg_setenv = NULL;
177 static bool arg_quiet = false;
178 static bool arg_share_system = false;
179 static bool arg_register = true;
180 static bool arg_keep_unit = false;
181 static char **arg_network_interfaces = NULL;
182 static char **arg_network_macvlan = NULL;
183 static bool arg_network_veth = false;
184 static const char *arg_network_bridge = NULL;
185 static unsigned long arg_personality = 0xffffffffLU;
186 static char *arg_image = NULL;
187 static Volatile arg_volatile = VOLATILE_NO;
188 static ExposePort *arg_expose_ports = NULL;
189
190 static void help(void) {
191         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
192                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
193                "  -h --help                 Show this help\n"
194                "     --version              Print version string\n"
195                "  -q --quiet                Do not show status information\n"
196                "  -D --directory=PATH       Root directory for the container\n"
197                "     --template=PATH        Initialize root directory from template directory,\n"
198                "                            if missing\n"
199                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
200                "                            remove it after exit\n"
201                "  -i --image=PATH           File system device or disk image for the container\n"
202                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
203                "  -u --user=USER            Run the command under specified user or uid\n"
204                "  -M --machine=NAME         Set the machine name for the container\n"
205                "     --uuid=UUID            Set a specific machine UUID for the container\n"
206                "  -S --slice=SLICE          Place the container in the specified slice\n"
207                "     --private-network      Disable network in container\n"
208                "     --network-interface=INTERFACE\n"
209                "                            Assign an existing network interface to the\n"
210                "                            container\n"
211                "     --network-macvlan=INTERFACE\n"
212                "                            Create a macvlan network interface based on an\n"
213                "                            existing network interface to the container\n"
214                "  -n --network-veth         Add a virtual ethernet connection between host\n"
215                "                            and container\n"
216                "     --network-bridge=INTERFACE\n"
217                "                            Add a virtual ethernet connection between host\n"
218                "                            and container and add it to an existing bridge on\n"
219                "                            the host\n"
220                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
221                "                            Expose a container IP port on the host\n"
222                "  -Z --selinux-context=SECLABEL\n"
223                "                            Set the SELinux security context to be used by\n"
224                "                            processes in the container\n"
225                "  -L --selinux-apifs-context=SECLABEL\n"
226                "                            Set the SELinux security context to be used by\n"
227                "                            API/tmpfs file systems in the container\n"
228                "     --capability=CAP       In addition to the default, retain specified\n"
229                "                            capability\n"
230                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
231                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
232                "                            try-guest, try-host\n"
233                "  -j                        Equivalent to --link-journal=try-guest\n"
234                "     --read-only            Mount the root directory read-only\n"
235                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
236                "                            the container\n"
237                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
238                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
239                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
240                "     --share-system         Share system namespaces with host\n"
241                "     --register=BOOLEAN     Register container as machine\n"
242                "     --keep-unit            Do not register a scope for the machine, reuse\n"
243                "                            the service unit nspawn is running in\n"
244                "     --volatile[=MODE]      Run the system in volatile mode\n"
245                , program_invocation_short_name);
246 }
247
/* Replace *b with a cleaned-up absolute form of 'path'.
 *
 * The path is resolved with canonicalize_file_name(). If that fails only
 * because the path does not exist (ENOENT), it is instead made absolute
 * relative to the current working directory. The previous string in *b
 * is freed and *b takes ownership of the new one.
 *
 * Returns 0 on success, -errno if resolving failed for any reason other
 * than ENOENT, and -ENOMEM if the fallback could not produce a path. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        if (resolved == NULL && errno != ENOENT)
                return -errno;

        if (resolved == NULL) {
                /* Target does not exist (yet): fall back to a purely
                 * lexical absolute path. */
                resolved = path_make_absolute_cwd(path);
                if (resolved == NULL)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
268
269 static int parse_argv(int argc, char *argv[]) {
270
271         enum {
272                 ARG_VERSION = 0x100,
273                 ARG_PRIVATE_NETWORK,
274                 ARG_UUID,
275                 ARG_READ_ONLY,
276                 ARG_CAPABILITY,
277                 ARG_DROP_CAPABILITY,
278                 ARG_LINK_JOURNAL,
279                 ARG_BIND,
280                 ARG_BIND_RO,
281                 ARG_TMPFS,
282                 ARG_SETENV,
283                 ARG_SHARE_SYSTEM,
284                 ARG_REGISTER,
285                 ARG_KEEP_UNIT,
286                 ARG_NETWORK_INTERFACE,
287                 ARG_NETWORK_MACVLAN,
288                 ARG_NETWORK_BRIDGE,
289                 ARG_PERSONALITY,
290                 ARG_VOLATILE,
291                 ARG_TEMPLATE,
292         };
293
294         static const struct option options[] = {
295                 { "help",                  no_argument,       NULL, 'h'                   },
296                 { "version",               no_argument,       NULL, ARG_VERSION           },
297                 { "directory",             required_argument, NULL, 'D'                   },
298                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
299                 { "ephemeral",             no_argument,       NULL, 'x'                   },
300                 { "user",                  required_argument, NULL, 'u'                   },
301                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
302                 { "boot",                  no_argument,       NULL, 'b'                   },
303                 { "uuid",                  required_argument, NULL, ARG_UUID              },
304                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
305                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
306                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
307                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
308                 { "bind",                  required_argument, NULL, ARG_BIND              },
309                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
310                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
311                 { "machine",               required_argument, NULL, 'M'                   },
312                 { "slice",                 required_argument, NULL, 'S'                   },
313                 { "setenv",                required_argument, NULL, ARG_SETENV            },
314                 { "selinux-context",       required_argument, NULL, 'Z'                   },
315                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
316                 { "quiet",                 no_argument,       NULL, 'q'                   },
317                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
318                 { "register",              required_argument, NULL, ARG_REGISTER          },
319                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
320                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
321                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
322                 { "network-veth",          no_argument,       NULL, 'n'                   },
323                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
324                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
325                 { "image",                 required_argument, NULL, 'i'                   },
326                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
327                 { "port",                  required_argument, NULL, 'p'                   },
328                 {}
329         };
330
331         int c, r;
332         uint64_t plus = 0, minus = 0;
333
334         assert(argc >= 0);
335         assert(argv);
336
337         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
338
339                 switch (c) {
340
341                 case 'h':
342                         help();
343                         return 0;
344
345                 case ARG_VERSION:
346                         puts(PACKAGE_STRING);
347                         puts(SYSTEMD_FEATURES);
348                         return 0;
349
350                 case 'D':
351                         r = set_sanitized_path(&arg_directory, optarg);
352                         if (r < 0)
353                                 return log_error_errno(r, "Invalid root directory: %m");
354
355                         break;
356
357                 case ARG_TEMPLATE:
358                         r = set_sanitized_path(&arg_template, optarg);
359                         if (r < 0)
360                                 return log_error_errno(r, "Invalid template directory: %m");
361
362                         break;
363
364                 case 'i':
365                         r = set_sanitized_path(&arg_image, optarg);
366                         if (r < 0)
367                                 return log_error_errno(r, "Invalid image path: %m");
368
369                         break;
370
371                 case 'x':
372                         arg_ephemeral = true;
373                         break;
374
375                 case 'u':
376                         free(arg_user);
377                         arg_user = strdup(optarg);
378                         if (!arg_user)
379                                 return log_oom();
380
381                         break;
382
383                 case ARG_NETWORK_BRIDGE:
384                         arg_network_bridge = optarg;
385
386                         /* fall through */
387
388                 case 'n':
389                         arg_network_veth = true;
390                         arg_private_network = true;
391                         break;
392
393                 case ARG_NETWORK_INTERFACE:
394                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
395                                 return log_oom();
396
397                         arg_private_network = true;
398                         break;
399
400                 case ARG_NETWORK_MACVLAN:
401                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
402                                 return log_oom();
403
404                         /* fall through */
405
406                 case ARG_PRIVATE_NETWORK:
407                         arg_private_network = true;
408                         break;
409
410                 case 'b':
411                         arg_boot = true;
412                         break;
413
414                 case ARG_UUID:
415                         r = sd_id128_from_string(optarg, &arg_uuid);
416                         if (r < 0) {
417                                 log_error("Invalid UUID: %s", optarg);
418                                 return r;
419                         }
420                         break;
421
422                 case 'S':
423                         arg_slice = optarg;
424                         break;
425
426                 case 'M':
427                         if (isempty(optarg)) {
428                                 free(arg_machine);
429                                 arg_machine = NULL;
430                         } else {
431                                 if (!machine_name_is_valid(optarg)) {
432                                         log_error("Invalid machine name: %s", optarg);
433                                         return -EINVAL;
434                                 }
435
436                                 r = free_and_strdup(&arg_machine, optarg);
437                                 if (r < 0)
438                                         return log_oom();
439
440                                 break;
441                         }
442
443                 case 'Z':
444                         arg_selinux_context = optarg;
445                         break;
446
447                 case 'L':
448                         arg_selinux_apifs_context = optarg;
449                         break;
450
451                 case ARG_READ_ONLY:
452                         arg_read_only = true;
453                         break;
454
455                 case ARG_CAPABILITY:
456                 case ARG_DROP_CAPABILITY: {
457                         const char *state, *word;
458                         size_t length;
459
460                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
461                                 _cleanup_free_ char *t;
462
463                                 t = strndup(word, length);
464                                 if (!t)
465                                         return log_oom();
466
467                                 if (streq(t, "all")) {
468                                         if (c == ARG_CAPABILITY)
469                                                 plus = (uint64_t) -1;
470                                         else
471                                                 minus = (uint64_t) -1;
472                                 } else {
473                                         int cap;
474
475                                         cap = capability_from_name(t);
476                                         if (cap < 0) {
477                                                 log_error("Failed to parse capability %s.", t);
478                                                 return -EINVAL;
479                                         }
480
481                                         if (c == ARG_CAPABILITY)
482                                                 plus |= 1ULL << (uint64_t) cap;
483                                         else
484                                                 minus |= 1ULL << (uint64_t) cap;
485                                 }
486                         }
487
488                         break;
489                 }
490
491                 case 'j':
492                         arg_link_journal = LINK_GUEST;
493                         arg_link_journal_try = true;
494                         break;
495
496                 case ARG_LINK_JOURNAL:
497                         if (streq(optarg, "auto")) {
498                                 arg_link_journal = LINK_AUTO;
499                                 arg_link_journal_try = false;
500                         } else if (streq(optarg, "no")) {
501                                 arg_link_journal = LINK_NO;
502                                 arg_link_journal_try = false;
503                         } else if (streq(optarg, "guest")) {
504                                 arg_link_journal = LINK_GUEST;
505                                 arg_link_journal_try = false;
506                         } else if (streq(optarg, "host")) {
507                                 arg_link_journal = LINK_HOST;
508                                 arg_link_journal_try = false;
509                         } else if (streq(optarg, "try-guest")) {
510                                 arg_link_journal = LINK_GUEST;
511                                 arg_link_journal_try = true;
512                         } else if (streq(optarg, "try-host")) {
513                                 arg_link_journal = LINK_HOST;
514                                 arg_link_journal_try = true;
515                         } else {
516                                 log_error("Failed to parse link journal mode %s", optarg);
517                                 return -EINVAL;
518                         }
519
520                         break;
521
522                 case ARG_BIND:
523                 case ARG_BIND_RO: {
524                         _cleanup_free_ char *a = NULL, *b = NULL;
525                         char *e;
526                         char ***x;
527
528                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
529
530                         e = strchr(optarg, ':');
531                         if (e) {
532                                 a = strndup(optarg, e - optarg);
533                                 b = strdup(e + 1);
534                         } else {
535                                 a = strdup(optarg);
536                                 b = strdup(optarg);
537                         }
538
539                         if (!a || !b)
540                                 return log_oom();
541
542                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
543                                 log_error("Invalid bind mount specification: %s", optarg);
544                                 return -EINVAL;
545                         }
546
547                         r = strv_extend(x, a);
548                         if (r < 0)
549                                 return log_oom();
550
551                         r = strv_extend(x, b);
552                         if (r < 0)
553                                 return log_oom();
554
555                         break;
556                 }
557
558                 case ARG_TMPFS: {
559                         _cleanup_free_ char *a = NULL, *b = NULL;
560                         char *e;
561
562                         e = strchr(optarg, ':');
563                         if (e) {
564                                 a = strndup(optarg, e - optarg);
565                                 b = strdup(e + 1);
566                         } else {
567                                 a = strdup(optarg);
568                                 b = strdup("mode=0755");
569                         }
570
571                         if (!a || !b)
572                                 return log_oom();
573
574                         if (!path_is_absolute(a)) {
575                                 log_error("Invalid tmpfs specification: %s", optarg);
576                                 return -EINVAL;
577                         }
578
579                         r = strv_push(&arg_tmpfs, a);
580                         if (r < 0)
581                                 return log_oom();
582
583                         a = NULL;
584
585                         r = strv_push(&arg_tmpfs, b);
586                         if (r < 0)
587                                 return log_oom();
588
589                         b = NULL;
590
591                         break;
592                 }
593
594                 case ARG_SETENV: {
595                         char **n;
596
597                         if (!env_assignment_is_valid(optarg)) {
598                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
599                                 return -EINVAL;
600                         }
601
602                         n = strv_env_set(arg_setenv, optarg);
603                         if (!n)
604                                 return log_oom();
605
606                         strv_free(arg_setenv);
607                         arg_setenv = n;
608                         break;
609                 }
610
611                 case 'q':
612                         arg_quiet = true;
613                         break;
614
615                 case ARG_SHARE_SYSTEM:
616                         arg_share_system = true;
617                         break;
618
619                 case ARG_REGISTER:
620                         r = parse_boolean(optarg);
621                         if (r < 0) {
622                                 log_error("Failed to parse --register= argument: %s", optarg);
623                                 return r;
624                         }
625
626                         arg_register = r;
627                         break;
628
629                 case ARG_KEEP_UNIT:
630                         arg_keep_unit = true;
631                         break;
632
633                 case ARG_PERSONALITY:
634
635                         arg_personality = personality_from_string(optarg);
636                         if (arg_personality == 0xffffffffLU) {
637                                 log_error("Unknown or unsupported personality '%s'.", optarg);
638                                 return -EINVAL;
639                         }
640
641                         break;
642
643                 case ARG_VOLATILE:
644
645                         if (!optarg)
646                                 arg_volatile = VOLATILE_YES;
647                         else {
648                                 r = parse_boolean(optarg);
649                                 if (r < 0) {
650                                         if (streq(optarg, "state"))
651                                                 arg_volatile = VOLATILE_STATE;
652                                         else {
653                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
654                                                 return r;
655                                         }
656                                 } else
657                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
658                         }
659
660                         break;
661
662                 case 'p': {
663                         const char *split, *e;
664                         uint16_t container_port, host_port;
665                         int protocol;
666                         ExposePort *p;
667
668                         if ((e = startswith(optarg, "tcp:")))
669                                 protocol = IPPROTO_TCP;
670                         else if ((e = startswith(optarg, "udp:")))
671                                 protocol = IPPROTO_UDP;
672                         else {
673                                 e = optarg;
674                                 protocol = IPPROTO_TCP;
675                         }
676
677                         split = strchr(e, ':');
678                         if (split) {
679                                 char v[split - e + 1];
680
681                                 memcpy(v, e, split - e);
682                                 v[split - e] = 0;
683
684                                 r = safe_atou16(v, &host_port);
685                                 if (r < 0 || host_port <= 0) {
686                                         log_error("Failed to parse host port: %s", optarg);
687                                         return -EINVAL;
688                                 }
689
690                                 r = safe_atou16(split + 1, &container_port);
691                         } else {
692                                 r = safe_atou16(e, &container_port);
693                                 host_port = container_port;
694                         }
695
696                         if (r < 0 || container_port <= 0) {
697                                 log_error("Failed to parse host port: %s", optarg);
698                                 return -EINVAL;
699                         }
700
701                         LIST_FOREACH(ports, p, arg_expose_ports) {
702                                 if (p->protocol == protocol && p->host_port == host_port) {
703                                         log_error("Duplicate port specification: %s", optarg);
704                                         return -EINVAL;
705                                 }
706                         }
707
708                         p = new(ExposePort, 1);
709                         if (!p)
710                                 return log_oom();
711
712                         p->protocol = protocol;
713                         p->host_port = host_port;
714                         p->container_port = container_port;
715
716                         LIST_PREPEND(ports, arg_expose_ports, p);
717
718                         break;
719                 }
720
721                 case '?':
722                         return -EINVAL;
723
724                 default:
725                         assert_not_reached("Unhandled option");
726                 }
727
728         if (arg_share_system)
729                 arg_register = false;
730
731         if (arg_boot && arg_share_system) {
732                 log_error("--boot and --share-system may not be combined.");
733                 return -EINVAL;
734         }
735
736         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
737                 log_error("--keep-unit may not be used when invoked from a user session.");
738                 return -EINVAL;
739         }
740
741         if (arg_directory && arg_image) {
742                 log_error("--directory= and --image= may not be combined.");
743                 return -EINVAL;
744         }
745
746         if (arg_template && arg_image) {
747                 log_error("--template= and --image= may not be combined.");
748                 return -EINVAL;
749         }
750
751         if (arg_template && !(arg_directory || arg_machine)) {
752                 log_error("--template= needs --directory= or --machine=.");
753                 return -EINVAL;
754         }
755
756         if (arg_ephemeral && arg_template) {
757                 log_error("--ephemeral and --template= may not be combined.");
758                 return -EINVAL;
759         }
760
761         if (arg_ephemeral && arg_image) {
762                 log_error("--ephemeral and --image= may not be combined.");
763                 return -EINVAL;
764         }
765
766         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
767                 log_error("--ephemeral and --link-journal= may not be combined.");
768                 return -EINVAL;
769         }
770
771         if (arg_volatile != VOLATILE_NO && arg_read_only) {
772                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
773                 return -EINVAL;
774         }
775
776         if (arg_expose_ports && !arg_private_network) {
777                 log_error("Cannot use --port= without private networking.");
778                 return -EINVAL;
779         }
780
781         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
782
783         return 1;
784 }
785
786 static int mount_all(const char *dest) {
787
788         typedef struct MountPoint {
789                 const char *what;
790                 const char *where;
791                 const char *type;
792                 const char *options;
793                 unsigned long flags;
794                 bool fatal;
795         } MountPoint;
796
797         static const MountPoint mount_table[] = {
798                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
799                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
800                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
801                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
802                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
803                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
804                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
805                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
806 #ifdef HAVE_SELINUX
807                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
808                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
809 #endif
810         };
811
812         unsigned k;
813         int r = 0;
814
815         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
816                 _cleanup_free_ char *where = NULL;
817 #ifdef HAVE_SELINUX
818                 _cleanup_free_ char *options = NULL;
819 #endif
820                 const char *o;
821                 int t;
822
823                 where = strjoin(dest, "/", mount_table[k].where, NULL);
824                 if (!where)
825                         return log_oom();
826
827                 t = path_is_mount_point(where, true);
828                 if (t < 0) {
829                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
830
831                         if (r == 0)
832                                 r = t;
833
834                         continue;
835                 }
836
837                 /* Skip this entry if it is not a remount. */
838                 if (mount_table[k].what && t > 0)
839                         continue;
840
841                 t = mkdir_p(where, 0755);
842                 if (t < 0) {
843                         if (mount_table[k].fatal) {
844                                log_error_errno(t, "Failed to create directory %s: %m", where);
845
846                                 if (r == 0)
847                                         r = t;
848                         } else
849                                log_warning_errno(t, "Failed to create directory %s: %m", where);
850
851                         continue;
852                 }
853
854 #ifdef HAVE_SELINUX
855                 if (arg_selinux_apifs_context &&
856                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
857                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
858                         if (!options)
859                                 return log_oom();
860
861                         o = options;
862                 } else
863 #endif
864                         o = mount_table[k].options;
865
866
867                 if (mount(mount_table[k].what,
868                           where,
869                           mount_table[k].type,
870                           mount_table[k].flags,
871                           o) < 0) {
872
873                         if (mount_table[k].fatal) {
874                                 log_error_errno(errno, "mount(%s) failed: %m", where);
875
876                                 if (r == 0)
877                                         r = -errno;
878                         } else
879                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
880                 }
881         }
882
883         return r;
884 }
885
886 static int mount_binds(const char *dest, char **l, bool ro) {
887         char **x, **y;
888
889         STRV_FOREACH_PAIR(x, y, l) {
890                 _cleanup_free_ char *where = NULL;
891                 struct stat source_st, dest_st;
892                 int r;
893
894                 if (stat(*x, &source_st) < 0)
895                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
896
897                 where = strappend(dest, *y);
898                 if (!where)
899                         return log_oom();
900
901                 r = stat(where, &dest_st);
902                 if (r == 0) {
903                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
904                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
905                                 return -EINVAL;
906                         }
907                 } else if (errno == ENOENT) {
908                         r = mkdir_parents_label(where, 0755);
909                         if (r < 0)
910                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
911                 } else {
912                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
913                         return -errno;
914                 }
915
916                 /* Create the mount point, but be conservative -- refuse to create block
917                  * and char devices. */
918                 if (S_ISDIR(source_st.st_mode)) {
919                         r = mkdir_label(where, 0755);
920                         if (r < 0 && errno != EEXIST)
921                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
922                 } else if (S_ISFIFO(source_st.st_mode)) {
923                         r = mkfifo(where, 0644);
924                         if (r < 0 && errno != EEXIST)
925                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
926                 } else if (S_ISSOCK(source_st.st_mode)) {
927                         r = mknod(where, 0644 | S_IFSOCK, 0);
928                         if (r < 0 && errno != EEXIST)
929                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
930                 } else if (S_ISREG(source_st.st_mode)) {
931                         r = touch(where);
932                         if (r < 0)
933                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
934                 } else {
935                         log_error("Refusing to create mountpoint for file: %s", *x);
936                         return -ENOTSUP;
937                 }
938
939                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
940                         return log_error_errno(errno, "mount(%s) failed: %m", where);
941
942                 if (ro) {
943                         r = bind_remount_recursive(where, true);
944                         if (r < 0)
945                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
946                 }
947         }
948
949         return 0;
950 }
951
/* Mounts a cgroup hierarchy with the mount options 'controller' to
 * <dest>/sys/fs/cgroup/<hierarchy>, read-only if 'read_only' is set.
 *
 * Returns 1 if the hierarchy was mounted, 0 if the directory already is
 * a mount point, and a negative errno-style value on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        unsigned long flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *where;
        int q;

        where = strappenda(dest, "/sys/fs/cgroup/", hierarchy);

        q = path_is_mount_point(where, false);
        if (q < 0)
                return log_error_errno(q, "Failed to determine if %s is mounted already: %m", where);
        if (q > 0)
                return 0;

        if (read_only)
                flags |= MS_RDONLY;

        /* The result of the directory creation is not checked here. */
        mkdir_p(where, 0755);

        if (mount("cgroup", where, "cgroup", flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", where);

        return 1;
}
971
972 static int mount_cgroup(const char *dest) {
973         _cleanup_set_free_free_ Set *controllers = NULL;
974         _cleanup_free_ char *own_cgroup_path = NULL;
975         const char *cgroup_root, *systemd_root, *systemd_own;
976         int r;
977
978         controllers = set_new(&string_hash_ops);
979         if (!controllers)
980                 return log_oom();
981
982         r = cg_kernel_controllers(controllers);
983         if (r < 0)
984                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
985
986         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
987         if (r < 0)
988                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
989
990         cgroup_root = strappenda(dest, "/sys/fs/cgroup");
991         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
992                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
993
994         for (;;) {
995                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
996
997                 controller = set_steal_first(controllers);
998                 if (!controller)
999                         break;
1000
1001                 origin = strappend("/sys/fs/cgroup/", controller);
1002                 if (!origin)
1003                         return log_oom();
1004
1005                 r = readlink_malloc(origin, &combined);
1006                 if (r == -EINVAL) {
1007                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1008
1009                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1010                         if (r < 0)
1011                                 return r;
1012
1013                 } else if (r < 0)
1014                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1015                 else {
1016                         _cleanup_free_ char *target = NULL;
1017
1018                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1019                         if (!target)
1020                                 return log_oom();
1021
1022                         /* A symbolic link, a combination of controllers in one hierarchy */
1023
1024                         if (!filename_is_valid(combined)) {
1025                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1026                                 continue;
1027                         }
1028
1029                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1030                         if (r < 0)
1031                                 return r;
1032
1033                         if (symlink(combined, target) < 0)
1034                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1035                 }
1036         }
1037
1038         r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1039         if (r < 0)
1040                 return r;
1041
1042         /* Make our own cgroup a (writable) bind mount */
1043         systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1044         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1045                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1046
1047         /* And then remount the systemd cgroup root read-only */
1048         systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1049         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1050                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1051
1052         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1053                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1054
1055         return 0;
1056 }
1057
1058 static int mount_tmpfs(const char *dest) {
1059         char **i, **o;
1060
1061         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1062                 _cleanup_free_ char *where = NULL;
1063                 int r;
1064
1065                 where = strappend(dest, *i);
1066                 if (!where)
1067                         return log_oom();
1068
1069                 r = mkdir_label(where, 0755);
1070                 if (r < 0 && r != -EEXIST)
1071                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1072
1073                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1074                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1075         }
1076
1077         return 0;
1078 }
1079
1080 static int setup_timezone(const char *dest) {
1081         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1082         char *z, *y;
1083         int r;
1084
1085         assert(dest);
1086
1087         /* Fix the timezone, if possible */
1088         r = readlink_malloc("/etc/localtime", &p);
1089         if (r < 0) {
1090                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1091                 return 0;
1092         }
1093
1094         z = path_startswith(p, "../usr/share/zoneinfo/");
1095         if (!z)
1096                 z = path_startswith(p, "/usr/share/zoneinfo/");
1097         if (!z) {
1098                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1099                 return 0;
1100         }
1101
1102         where = strappend(dest, "/etc/localtime");
1103         if (!where)
1104                 return log_oom();
1105
1106         r = readlink_malloc(where, &q);
1107         if (r >= 0) {
1108                 y = path_startswith(q, "../usr/share/zoneinfo/");
1109                 if (!y)
1110                         y = path_startswith(q, "/usr/share/zoneinfo/");
1111
1112                 /* Already pointing to the right place? Then do nothing .. */
1113                 if (y && streq(y, z))
1114                         return 0;
1115         }
1116
1117         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1118         if (!check)
1119                 return log_oom();
1120
1121         if (access(check, F_OK) < 0) {
1122                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1123                 return 0;
1124         }
1125
1126         what = strappend("../usr/share/zoneinfo/", z);
1127         if (!what)
1128                 return log_oom();
1129
1130         r = mkdir_parents(where, 0755);
1131         if (r < 0) {
1132                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1133
1134                 return 0;
1135         }
1136
1137         r = unlink(where);
1138         if (r < 0 && errno != ENOENT) {
1139                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1140
1141                 return 0;
1142         }
1143
1144         if (symlink(what, where) < 0) {
1145                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1146                 return 0;
1147         }
1148
1149         return 0;
1150 }
1151
1152 static int setup_resolv_conf(const char *dest) {
1153         _cleanup_free_ char *where = NULL;
1154         int r;
1155
1156         assert(dest);
1157
1158         if (arg_private_network)
1159                 return 0;
1160
1161         /* Fix resolv.conf, if possible */
1162         where = strappend(dest, "/etc/resolv.conf");
1163         if (!where)
1164                 return log_oom();
1165
1166         /* We don't really care for the results of this really. If it
1167          * fails, it fails, but meh... */
1168         r = mkdir_parents(where, 0755);
1169         if (r < 0) {
1170                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1171
1172                 return 0;
1173         }
1174
1175         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1176         if (r < 0) {
1177                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1178
1179                 return 0;
1180         }
1181
1182         return 0;
1183 }
1184
1185 static int setup_volatile_state(const char *directory) {
1186         const char *p;
1187         int r;
1188
1189         assert(directory);
1190
1191         if (arg_volatile != VOLATILE_STATE)
1192                 return 0;
1193
1194         /* --volatile=state means we simply overmount /var
1195            with a tmpfs, and the rest read-only. */
1196
1197         r = bind_remount_recursive(directory, true);
1198         if (r < 0)
1199                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1200
1201         p = strappenda(directory, "/var");
1202         r = mkdir(p, 0755);
1203         if (r < 0 && errno != EEXIST)
1204                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1205
1206         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1207                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1208
1209         return 0;
1210 }
1211
1212 static int setup_volatile(const char *directory) {
1213         bool tmpfs_mounted = false, bind_mounted = false;
1214         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1215         const char *f, *t;
1216         int r;
1217
1218         assert(directory);
1219
1220         if (arg_volatile != VOLATILE_YES)
1221                 return 0;
1222
1223         /* --volatile=yes means we mount a tmpfs to the root dir, and
1224            the original /usr to use inside it, and that read-only. */
1225
1226         if (!mkdtemp(template))
1227                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1228
1229         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1230                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1231                 r = -errno;
1232                 goto fail;
1233         }
1234
1235         tmpfs_mounted = true;
1236
1237         f = strappenda(directory, "/usr");
1238         t = strappenda(template, "/usr");
1239
1240         r = mkdir(t, 0755);
1241         if (r < 0 && errno != EEXIST) {
1242                 log_error_errno(errno, "Failed to create %s: %m", t);
1243                 r = -errno;
1244                 goto fail;
1245         }
1246
1247         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1248                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1249                 r = -errno;
1250                 goto fail;
1251         }
1252
1253         bind_mounted = true;
1254
1255         r = bind_remount_recursive(t, true);
1256         if (r < 0) {
1257                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1258                 goto fail;
1259         }
1260
1261         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1262                 log_error_errno(errno, "Failed to move root mount: %m");
1263                 r = -errno;
1264                 goto fail;
1265         }
1266
1267         rmdir(template);
1268
1269         return 0;
1270
1271 fail:
1272         if (bind_mounted)
1273                 umount(t);
1274         if (tmpfs_mounted)
1275                 umount(template);
1276         rmdir(template);
1277         return r;
1278 }
1279
1280 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1281
1282         snprintf(s, 37,
1283                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1284                  SD_ID128_FORMAT_VAL(id));
1285
1286         return s;
1287 }
1288
1289 static int setup_boot_id(const char *dest) {
1290         _cleanup_free_ char *from = NULL, *to = NULL;
1291         sd_id128_t rnd = {};
1292         char as_uuid[37];
1293         int r;
1294
1295         assert(dest);
1296
1297         if (arg_share_system)
1298                 return 0;
1299
1300         /* Generate a new randomized boot ID, so that each boot-up of
1301          * the container gets a new one */
1302
1303         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1304         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1305         if (!from || !to)
1306                 return log_oom();
1307
1308         r = sd_id128_randomize(&rnd);
1309         if (r < 0)
1310                 return log_error_errno(r, "Failed to generate random boot id: %m");
1311
1312         id128_format_as_uuid(rnd, as_uuid);
1313
1314         r = write_string_file(from, as_uuid);
1315         if (r < 0)
1316                 return log_error_errno(r, "Failed to write boot id: %m");
1317
1318         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1319                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1320                 r = -errno;
1321         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1322                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1323
1324         unlink(from);
1325         return r;
1326 }
1327
1328 static int copy_devnodes(const char *dest) {
1329
1330         static const char devnodes[] =
1331                 "null\0"
1332                 "zero\0"
1333                 "full\0"
1334                 "random\0"
1335                 "urandom\0"
1336                 "tty\0"
1337                 "net/tun\0";
1338
1339         const char *d;
1340         int r = 0;
1341         _cleanup_umask_ mode_t u;
1342
1343         assert(dest);
1344
1345         u = umask(0000);
1346
1347         NULSTR_FOREACH(d, devnodes) {
1348                 _cleanup_free_ char *from = NULL, *to = NULL;
1349                 struct stat st;
1350
1351                 from = strappend("/dev/", d);
1352                 to = strjoin(dest, "/dev/", d, NULL);
1353                 if (!from || !to)
1354                         return log_oom();
1355
1356                 if (stat(from, &st) < 0) {
1357
1358                         if (errno != ENOENT)
1359                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1360
1361                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1362
1363                         log_error("%s is not a char or block device, cannot copy", from);
1364                         return -EIO;
1365
1366                 } else {
1367                         r = mkdir_parents(to, 0775);
1368                         if (r < 0) {
1369                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1370                                 return -r;
1371                         }
1372
1373                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1374                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1375                 }
1376         }
1377
1378         return r;
1379 }
1380
1381 static int setup_ptmx(const char *dest) {
1382         _cleanup_free_ char *p = NULL;
1383
1384         p = strappend(dest, "/dev/ptmx");
1385         if (!p)
1386                 return log_oom();
1387
1388         if (symlink("pts/ptmx", p) < 0)
1389                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1390
1391         return 0;
1392 }
1393
1394 static int setup_dev_console(const char *dest, const char *console) {
1395         _cleanup_umask_ mode_t u;
1396         const char *to;
1397         struct stat st;
1398         int r;
1399
1400         assert(dest);
1401         assert(console);
1402
1403         u = umask(0000);
1404
1405         if (stat("/dev/null", &st) < 0)
1406                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1407
1408         r = chmod_and_chown(console, 0600, 0, 0);
1409         if (r < 0)
1410                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1411
1412         /* We need to bind mount the right tty to /dev/console since
1413          * ptys can only exist on pts file systems. To have something
1414          * to bind mount things on we create a device node first, and
1415          * use /dev/null for that since we the cgroups device policy
1416          * allows us to create that freely, while we cannot create
1417          * /dev/console. (Note that the major minor doesn't actually
1418          * matter here, since we mount it over anyway). */
1419
1420         to = strappenda(dest, "/dev/console");
1421         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1422                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1423
1424         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1425                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1426
1427         return 0;
1428 }
1429
1430 static int setup_kmsg(const char *dest, int kmsg_socket) {
1431         _cleanup_free_ char *from = NULL, *to = NULL;
1432         _cleanup_umask_ mode_t u;
1433         int r, fd, k;
1434         union {
1435                 struct cmsghdr cmsghdr;
1436                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1437         } control = {};
1438         struct msghdr mh = {
1439                 .msg_control = &control,
1440                 .msg_controllen = sizeof(control),
1441         };
1442         struct cmsghdr *cmsg;
1443
1444         assert(dest);
1445         assert(kmsg_socket >= 0);
1446
1447         u = umask(0000);
1448
1449         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1450          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1451          * on the reading side behave very similar to /proc/kmsg,
1452          * their writing side behaves differently from /dev/kmsg in
1453          * that writing blocks when nothing is reading. In order to
1454          * avoid any problems with containers deadlocking due to this
1455          * we simply make /dev/kmsg unavailable to the container. */
1456         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1457             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1458                 return log_oom();
1459
1460         if (mkfifo(from, 0600) < 0)
1461                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1462
1463         r = chmod_and_chown(from, 0600, 0, 0);
1464         if (r < 0)
1465                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1466
1467         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1468                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1469
1470         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1471         if (fd < 0)
1472                 return log_error_errno(errno, "Failed to open fifo: %m");
1473
1474         cmsg = CMSG_FIRSTHDR(&mh);
1475         cmsg->cmsg_level = SOL_SOCKET;
1476         cmsg->cmsg_type = SCM_RIGHTS;
1477         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1478         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1479
1480         mh.msg_controllen = cmsg->cmsg_len;
1481
1482         /* Store away the fd in the socket, so that it stays open as
1483          * long as we run the child */
1484         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1485         safe_close(fd);
1486
1487         if (k < 0)
1488                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1489
1490         /* And now make the FIFO unavailable as /dev/kmsg... */
1491         unlink(from);
1492         return 0;
1493 }
1494
1495 static int send_rtnl(int send_fd) {
1496         union {
1497                 struct cmsghdr cmsghdr;
1498                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1499         } control = {};
1500         struct msghdr mh = {
1501                 .msg_control = &control,
1502                 .msg_controllen = sizeof(control),
1503         };
1504         struct cmsghdr *cmsg;
1505         _cleanup_close_ int fd = -1;
1506         ssize_t k;
1507
1508         assert(send_fd >= 0);
1509
1510         if (!arg_expose_ports)
1511                 return 0;
1512
1513         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1514         if (fd < 0)
1515                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1516
1517         cmsg = CMSG_FIRSTHDR(&mh);
1518         cmsg->cmsg_level = SOL_SOCKET;
1519         cmsg->cmsg_type = SCM_RIGHTS;
1520         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1521         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1522
1523         mh.msg_controllen = cmsg->cmsg_len;
1524
1525         /* Store away the fd in the socket, so that it stays open as
1526          * long as we run the child */
1527         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1528         if (k < 0)
1529                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1530
1531         return 0;
1532 }
1533
1534 static int flush_ports(union in_addr_union *exposed) {
1535         ExposePort *p;
1536         int r, af = AF_INET;
1537
1538         assert(exposed);
1539
1540         if (!arg_expose_ports)
1541                 return 0;
1542
1543         if (in_addr_is_null(af, exposed))
1544                 return 0;
1545
1546         log_debug("Lost IP address.");
1547
1548         LIST_FOREACH(ports, p, arg_expose_ports) {
1549                 r = fw_add_local_dnat(false,
1550                                       af,
1551                                       p->protocol,
1552                                       NULL,
1553                                       NULL, 0,
1554                                       NULL, 0,
1555                                       p->host_port,
1556                                       exposed,
1557                                       p->container_port,
1558                                       NULL);
1559                 if (r < 0)
1560                         log_warning_errno(r, "Failed to modify firewall: %m");
1561         }
1562
1563         *exposed = IN_ADDR_NULL;
1564         return 0;
1565 }
1566
1567 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1568         _cleanup_free_ struct local_address *addresses = NULL;
1569         _cleanup_free_ char *pretty = NULL;
1570         union in_addr_union new_exposed;
1571         ExposePort *p;
1572         bool add;
1573         int af = AF_INET, r;
1574
1575         assert(exposed);
1576
1577         /* Invoked each time an address is added or removed inside the
1578          * container */
1579
1580         if (!arg_expose_ports)
1581                 return 0;
1582
1583         r = local_addresses(rtnl, 0, af, &addresses);
1584         if (r < 0)
1585                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1586
1587         add = r > 0 &&
1588                 addresses[0].family == af &&
1589                 addresses[0].scope < RT_SCOPE_LINK;
1590
1591         if (!add)
1592                 return flush_ports(exposed);
1593
1594         new_exposed = addresses[0].address;
1595         if (in_addr_equal(af, exposed, &new_exposed))
1596                 return 0;
1597
1598         in_addr_to_string(af, &new_exposed, &pretty);
1599         log_debug("New container IP is %s.", strna(pretty));
1600
1601         LIST_FOREACH(ports, p, arg_expose_ports) {
1602
1603                 r = fw_add_local_dnat(true,
1604                                       af,
1605                                       p->protocol,
1606                                       NULL,
1607                                       NULL, 0,
1608                                       NULL, 0,
1609                                       p->host_port,
1610                                       &new_exposed,
1611                                       p->container_port,
1612                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1613                 if (r < 0)
1614                         log_warning_errno(r, "Failed to modify firewall: %m");
1615         }
1616
1617         *exposed = new_exposed;
1618         return 0;
1619 }
1620
1621 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1622         union in_addr_union *exposed = userdata;
1623
1624         assert(rtnl);
1625         assert(m);
1626         assert(exposed);
1627
1628         expose_ports(rtnl, exposed);
1629         return 0;
1630 }
1631
1632 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1633         union {
1634                 struct cmsghdr cmsghdr;
1635                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1636         } control = {};
1637         struct msghdr mh = {
1638                 .msg_control = &control,
1639                 .msg_controllen = sizeof(control),
1640         };
1641         struct cmsghdr *cmsg;
1642         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1643         int fd, r;
1644         ssize_t k;
1645
1646         assert(event);
1647         assert(recv_fd >= 0);
1648         assert(ret);
1649
1650         if (!arg_expose_ports)
1651                 return 0;
1652
1653         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1654         if (k < 0)
1655                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1656
1657         cmsg = CMSG_FIRSTHDR(&mh);
1658         assert(cmsg->cmsg_level == SOL_SOCKET);
1659         assert(cmsg->cmsg_type == SCM_RIGHTS);
1660         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1661         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1662
1663         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1664         if (r < 0) {
1665                 safe_close(fd);
1666                 return log_error_errno(r, "Failed to create rtnl object: %m");
1667         }
1668
1669         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1670         if (r < 0)
1671                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1672
1673         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1674         if (r < 0)
1675                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1676
1677         r = sd_rtnl_attach_event(rtnl, event, 0);
1678         if (r < 0)
1679                 return log_error_errno(r, "Failed to add to even loop: %m");
1680
1681         *ret = rtnl;
1682         rtnl = NULL;
1683
1684         return 0;
1685 }
1686
1687 static int setup_hostname(void) {
1688
1689         if (arg_share_system)
1690                 return 0;
1691
1692         if (sethostname_idempotent(arg_machine) < 0)
1693                 return -errno;
1694
1695         return 0;
1696 }
1697
1698 static int setup_journal(const char *directory) {
1699         sd_id128_t machine_id, this_id;
1700         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1701         char *id;
1702         int r;
1703
1704         /* Don't link journals in ephemeral mode */
1705         if (arg_ephemeral)
1706                 return 0;
1707
1708         p = strappend(directory, "/etc/machine-id");
1709         if (!p)
1710                 return log_oom();
1711
1712         r = read_one_line_file(p, &b);
1713         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1714                 return 0;
1715         else if (r < 0)
1716                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1717
1718         id = strstrip(b);
1719         if (isempty(id) && arg_link_journal == LINK_AUTO)
1720                 return 0;
1721
1722         /* Verify validity */
1723         r = sd_id128_from_string(id, &machine_id);
1724         if (r < 0)
1725                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1726
1727         r = sd_id128_get_machine(&this_id);
1728         if (r < 0)
1729                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1730
1731         if (sd_id128_equal(machine_id, this_id)) {
1732                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1733                          "Host and machine ids are equal (%s): refusing to link journals", id);
1734                 if (arg_link_journal == LINK_AUTO)
1735                         return 0;
1736                 return -EEXIST;
1737         }
1738
1739         if (arg_link_journal == LINK_NO)
1740                 return 0;
1741
1742         free(p);
1743         p = strappend("/var/log/journal/", id);
1744         q = strjoin(directory, "/var/log/journal/", id, NULL);
1745         if (!p || !q)
1746                 return log_oom();
1747
1748         if (path_is_mount_point(p, false) > 0) {
1749                 if (arg_link_journal != LINK_AUTO) {
1750                         log_error("%s: already a mount point, refusing to use for journal", p);
1751                         return -EEXIST;
1752                 }
1753
1754                 return 0;
1755         }
1756
1757         if (path_is_mount_point(q, false) > 0) {
1758                 if (arg_link_journal != LINK_AUTO) {
1759                         log_error("%s: already a mount point, refusing to use for journal", q);
1760                         return -EEXIST;
1761                 }
1762
1763                 return 0;
1764         }
1765
1766         r = readlink_and_make_absolute(p, &d);
1767         if (r >= 0) {
1768                 if ((arg_link_journal == LINK_GUEST ||
1769                      arg_link_journal == LINK_AUTO) &&
1770                     path_equal(d, q)) {
1771
1772                         r = mkdir_p(q, 0755);
1773                         if (r < 0)
1774                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1775                         return 0;
1776                 }
1777
1778                 if (unlink(p) < 0)
1779                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1780         } else if (r == -EINVAL) {
1781
1782                 if (arg_link_journal == LINK_GUEST &&
1783                     rmdir(p) < 0) {
1784
1785                         if (errno == ENOTDIR) {
1786                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1787                                 return r;
1788                         } else {
1789                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1790                                 return -errno;
1791                         }
1792                 }
1793         } else if (r != -ENOENT) {
1794                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1795                 return r;
1796         }
1797
1798         if (arg_link_journal == LINK_GUEST) {
1799
1800                 if (symlink(q, p) < 0) {
1801                         if (arg_link_journal_try) {
1802                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1803                                 return 0;
1804                         } else {
1805                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1806                                 return -errno;
1807                         }
1808                 }
1809
1810                 r = mkdir_p(q, 0755);
1811                 if (r < 0)
1812                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1813                 return 0;
1814         }
1815
1816         if (arg_link_journal == LINK_HOST) {
1817                 /* don't create parents here -- if the host doesn't have
1818                  * permanent journal set up, don't force it here */
1819                 r = mkdir(p, 0755);
1820                 if (r < 0) {
1821                         if (arg_link_journal_try) {
1822                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1823                                 return 0;
1824                         } else {
1825                                 log_error_errno(errno, "Failed to create %s: %m", p);
1826                                 return r;
1827                         }
1828                 }
1829
1830         } else if (access(p, F_OK) < 0)
1831                 return 0;
1832
1833         if (dir_is_empty(q) == 0)
1834                 log_warning("%s is not empty, proceeding anyway.", q);
1835
1836         r = mkdir_p(q, 0755);
1837         if (r < 0) {
1838                 log_error_errno(errno, "Failed to create %s: %m", q);
1839                 return r;
1840         }
1841
1842         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1843                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1844
1845         return 0;
1846 }
1847
1848 static int drop_capabilities(void) {
1849         return capability_bounding_set_drop(~arg_retain, false);
1850 }
1851
1852 static int register_machine(pid_t pid, int local_ifindex) {
1853         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1854         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1855         int r;
1856
1857         if (!arg_register)
1858                 return 0;
1859
1860         r = sd_bus_default_system(&bus);
1861         if (r < 0)
1862                 return log_error_errno(r, "Failed to open system bus: %m");
1863
1864         if (arg_keep_unit) {
1865                 r = sd_bus_call_method(
1866                                 bus,
1867                                 "org.freedesktop.machine1",
1868                                 "/org/freedesktop/machine1",
1869                                 "org.freedesktop.machine1.Manager",
1870                                 "RegisterMachineWithNetwork",
1871                                 &error,
1872                                 NULL,
1873                                 "sayssusai",
1874                                 arg_machine,
1875                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1876                                 "nspawn",
1877                                 "container",
1878                                 (uint32_t) pid,
1879                                 strempty(arg_directory),
1880                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1881         } else {
1882                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1883
1884                 r = sd_bus_message_new_method_call(
1885                                 bus,
1886                                 &m,
1887                                 "org.freedesktop.machine1",
1888                                 "/org/freedesktop/machine1",
1889                                 "org.freedesktop.machine1.Manager",
1890                                 "CreateMachineWithNetwork");
1891                 if (r < 0)
1892                         return log_error_errno(r, "Failed to create message: %m");
1893
1894                 r = sd_bus_message_append(
1895                                 m,
1896                                 "sayssusai",
1897                                 arg_machine,
1898                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1899                                 "nspawn",
1900                                 "container",
1901                                 (uint32_t) pid,
1902                                 strempty(arg_directory),
1903                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1904                 if (r < 0)
1905                         return log_error_errno(r, "Failed to append message arguments: %m");
1906
1907                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1908                 if (r < 0)
1909                         return log_error_errno(r, "Failed to open container: %m");
1910
1911                 if (!isempty(arg_slice)) {
1912                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1913                         if (r < 0)
1914                                 return log_error_errno(r, "Failed to append slice: %m");
1915                 }
1916
1917                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1918                 if (r < 0)
1919                         return log_error_errno(r, "Failed to add device policy: %m");
1920
1921                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1922                                           /* Allow the container to
1923                                            * access and create the API
1924                                            * device nodes, so that
1925                                            * PrivateDevices= in the
1926                                            * container can work
1927                                            * fine */
1928                                           "/dev/null", "rwm",
1929                                           "/dev/zero", "rwm",
1930                                           "/dev/full", "rwm",
1931                                           "/dev/random", "rwm",
1932                                           "/dev/urandom", "rwm",
1933                                           "/dev/tty", "rwm",
1934                                           "/dev/net/tun", "rwm",
1935                                           /* Allow the container
1936                                            * access to ptys. However,
1937                                            * do not permit the
1938                                            * container to ever create
1939                                            * these device nodes. */
1940                                           "/dev/pts/ptmx", "rw",
1941                                           "char-pts", "rw");
1942                 if (r < 0)
1943                         return log_error_errno(r, "Failed to add device whitelist: %m");
1944
1945                 r = sd_bus_message_close_container(m);
1946                 if (r < 0)
1947                         return log_error_errno(r, "Failed to close container: %m");
1948
1949                 r = sd_bus_call(bus, m, 0, &error, NULL);
1950         }
1951
1952         if (r < 0) {
1953                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1954                 return r;
1955         }
1956
1957         return 0;
1958 }
1959
1960 static int terminate_machine(pid_t pid) {
1961         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1962         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1963         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1964         const char *path;
1965         int r;
1966
1967         if (!arg_register)
1968                 return 0;
1969
1970         r = sd_bus_default_system(&bus);
1971         if (r < 0)
1972                 return log_error_errno(r, "Failed to open system bus: %m");
1973
1974         r = sd_bus_call_method(
1975                         bus,
1976                         "org.freedesktop.machine1",
1977                         "/org/freedesktop/machine1",
1978                         "org.freedesktop.machine1.Manager",
1979                         "GetMachineByPID",
1980                         &error,
1981                         &reply,
1982                         "u",
1983                         (uint32_t) pid);
1984         if (r < 0) {
1985                 /* Note that the machine might already have been
1986                  * cleaned up automatically, hence don't consider it a
1987                  * failure if we cannot get the machine object. */
1988                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1989                 return 0;
1990         }
1991
1992         r = sd_bus_message_read(reply, "o", &path);
1993         if (r < 0)
1994                 return bus_log_parse_error(r);
1995
1996         r = sd_bus_call_method(
1997                         bus,
1998                         "org.freedesktop.machine1",
1999                         path,
2000                         "org.freedesktop.machine1.Machine",
2001                         "Terminate",
2002                         &error,
2003                         NULL,
2004                         NULL);
2005         if (r < 0) {
2006                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2007                 return 0;
2008         }
2009
2010         return 0;
2011 }
2012
2013 static int reset_audit_loginuid(void) {
2014         _cleanup_free_ char *p = NULL;
2015         int r;
2016
2017         if (arg_share_system)
2018                 return 0;
2019
2020         r = read_one_line_file("/proc/self/loginuid", &p);
2021         if (r == -ENOENT)
2022                 return 0;
2023         if (r < 0)
2024                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2025
2026         /* Already reset? */
2027         if (streq(p, "4294967295"))
2028                 return 0;
2029
2030         r = write_string_file("/proc/self/loginuid", "4294967295");
2031         if (r < 0) {
2032                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2033                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2034                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2035                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2036                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2037
2038                 sleep(5);
2039         }
2040
2041         return 0;
2042 }
2043
2044 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2045 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2046 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2047
2048 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2049         uint8_t result[8];
2050         size_t l, sz;
2051         uint8_t *v, *i;
2052         int r;
2053
2054         l = strlen(arg_machine);
2055         sz = sizeof(sd_id128_t) + l;
2056         if (idx > 0)
2057                 sz += sizeof(idx);
2058
2059         v = alloca(sz);
2060
2061         /* fetch some persistent data unique to the host */
2062         r = sd_id128_get_machine((sd_id128_t*) v);
2063         if (r < 0)
2064                 return r;
2065
2066         /* combine with some data unique (on this host) to this
2067          * container instance */
2068         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2069         if (idx > 0) {
2070                 idx = htole64(idx);
2071                 memcpy(i, &idx, sizeof(idx));
2072         }
2073
2074         /* Let's hash the host machine ID plus the container name. We
2075          * use a fixed, but originally randomly created hash key here. */
2076         siphash24(result, v, sz, hash_key.bytes);
2077
2078         assert_cc(ETH_ALEN <= sizeof(result));
2079         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2080
2081         /* see eth_random_addr in the kernel */
2082         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2083         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2084
2085         return 0;
2086 }
2087
2088 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2089         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2090         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2091         struct ether_addr mac_host, mac_container;
2092         int r, i;
2093
2094         if (!arg_private_network)
2095                 return 0;
2096
2097         if (!arg_network_veth)
2098                 return 0;
2099
2100         /* Use two different interface name prefixes depending whether
2101          * we are in bridge mode or not. */
2102         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2103                  arg_network_bridge ? "vb" : "ve", arg_machine);
2104
2105         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2106         if (r < 0)
2107                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2108
2109         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2110         if (r < 0)
2111                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2112
2113         r = sd_rtnl_open(&rtnl, 0);
2114         if (r < 0)
2115                 return log_error_errno(r, "Failed to connect to netlink: %m");
2116
2117         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2118         if (r < 0)
2119                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2120
2121         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2122         if (r < 0)
2123                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2124
2125         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2126         if (r < 0)
2127                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2128
2129         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2130         if (r < 0)
2131                 return log_error_errno(r, "Failed to open netlink container: %m");
2132
2133         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2134         if (r < 0)
2135                 return log_error_errno(r, "Failed to open netlink container: %m");
2136
2137         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2138         if (r < 0)
2139                 return log_error_errno(r, "Failed to open netlink container: %m");
2140
2141         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2142         if (r < 0)
2143                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2144
2145         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2146         if (r < 0)
2147                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2148
2149         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2150         if (r < 0)
2151                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2152
2153         r = sd_rtnl_message_close_container(m);
2154         if (r < 0)
2155                 return log_error_errno(r, "Failed to close netlink container: %m");
2156
2157         r = sd_rtnl_message_close_container(m);
2158         if (r < 0)
2159                 return log_error_errno(r, "Failed to close netlink container: %m");
2160
2161         r = sd_rtnl_message_close_container(m);
2162         if (r < 0)
2163                 return log_error_errno(r, "Failed to close netlink container: %m");
2164
2165         r = sd_rtnl_call(rtnl, m, 0, NULL);
2166         if (r < 0)
2167                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2168
2169         i = (int) if_nametoindex(iface_name);
2170         if (i <= 0)
2171                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2172
2173         *ifi = i;
2174
2175         return 0;
2176 }
2177
2178 static int setup_bridge(const char veth_name[], int *ifi) {
2179         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2180         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2181         int r, bridge;
2182
2183         if (!arg_private_network)
2184                 return 0;
2185
2186         if (!arg_network_veth)
2187                 return 0;
2188
2189         if (!arg_network_bridge)
2190                 return 0;
2191
2192         bridge = (int) if_nametoindex(arg_network_bridge);
2193         if (bridge <= 0)
2194                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2195
2196         *ifi = bridge;
2197
2198         r = sd_rtnl_open(&rtnl, 0);
2199         if (r < 0)
2200                 return log_error_errno(r, "Failed to connect to netlink: %m");
2201
2202         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2203         if (r < 0)
2204                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2205
2206         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2207         if (r < 0)
2208                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2209
2210         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2211         if (r < 0)
2212                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2213
2214         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2215         if (r < 0)
2216                 return log_error_errno(r, "Failed to add netlink master field: %m");
2217
2218         r = sd_rtnl_call(rtnl, m, 0, NULL);
2219         if (r < 0)
2220                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2221
2222         return 0;
2223 }
2224
2225 static int parse_interface(struct udev *udev, const char *name) {
2226         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2227         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2228         int ifi;
2229
2230         ifi = (int) if_nametoindex(name);
2231         if (ifi <= 0)
2232                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2233
2234         sprintf(ifi_str, "n%i", ifi);
2235         d = udev_device_new_from_device_id(udev, ifi_str);
2236         if (!d)
2237                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2238
2239         if (udev_device_get_is_initialized(d) <= 0) {
2240                 log_error("Network interface %s is not initialized yet.", name);
2241                 return -EBUSY;
2242         }
2243
2244         return ifi;
2245 }
2246
2247 static int move_network_interfaces(pid_t pid) {
2248         _cleanup_udev_unref_ struct udev *udev = NULL;
2249         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2250         char **i;
2251         int r;
2252
2253         if (!arg_private_network)
2254                 return 0;
2255
2256         if (strv_isempty(arg_network_interfaces))
2257                 return 0;
2258
2259         r = sd_rtnl_open(&rtnl, 0);
2260         if (r < 0)
2261                 return log_error_errno(r, "Failed to connect to netlink: %m");
2262
2263         udev = udev_new();
2264         if (!udev) {
2265                 log_error("Failed to connect to udev.");
2266                 return -ENOMEM;
2267         }
2268
2269         STRV_FOREACH(i, arg_network_interfaces) {
2270                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2271                 int ifi;
2272
2273                 ifi = parse_interface(udev, *i);
2274                 if (ifi < 0)
2275                         return ifi;
2276
2277                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2278                 if (r < 0)
2279                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2280
2281                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2282                 if (r < 0)
2283                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2284
2285                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2286                 if (r < 0)
2287                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2288         }
2289
2290         return 0;
2291 }
2292
2293 static int setup_macvlan(pid_t pid) {
2294         _cleanup_udev_unref_ struct udev *udev = NULL;
2295         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2296         unsigned idx = 0;
2297         char **i;
2298         int r;
2299
2300         if (!arg_private_network)
2301                 return 0;
2302
2303         if (strv_isempty(arg_network_macvlan))
2304                 return 0;
2305
2306         r = sd_rtnl_open(&rtnl, 0);
2307         if (r < 0)
2308                 return log_error_errno(r, "Failed to connect to netlink: %m");
2309
2310         udev = udev_new();
2311         if (!udev) {
2312                 log_error("Failed to connect to udev.");
2313                 return -ENOMEM;
2314         }
2315
2316         STRV_FOREACH(i, arg_network_macvlan) {
2317                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2318                 _cleanup_free_ char *n = NULL;
2319                 struct ether_addr mac;
2320                 int ifi;
2321
2322                 ifi = parse_interface(udev, *i);
2323                 if (ifi < 0)
2324                         return ifi;
2325
2326                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2327                 if (r < 0)
2328                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2329
2330                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2331                 if (r < 0)
2332                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2333
2334                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2335                 if (r < 0)
2336                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2337
2338                 n = strappend("mv-", *i);
2339                 if (!n)
2340                         return log_oom();
2341
2342                 strshorten(n, IFNAMSIZ-1);
2343
2344                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2345                 if (r < 0)
2346                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2347
2348                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2349                 if (r < 0)
2350                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2351
2352                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2353                 if (r < 0)
2354                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2355
2356                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2357                 if (r < 0)
2358                         return log_error_errno(r, "Failed to open netlink container: %m");
2359
2360                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2361                 if (r < 0)
2362                         return log_error_errno(r, "Failed to open netlink container: %m");
2363
2364                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2365                 if (r < 0)
2366                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2367
2368                 r = sd_rtnl_message_close_container(m);
2369                 if (r < 0)
2370                         return log_error_errno(r, "Failed to close netlink container: %m");
2371
2372                 r = sd_rtnl_message_close_container(m);
2373                 if (r < 0)
2374                         return log_error_errno(r, "Failed to close netlink container: %m");
2375
2376                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2377                 if (r < 0)
2378                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2379         }
2380
2381         return 0;
2382 }
2383
/* Installs the seccomp filter: everything is allowed by default, a fixed list of
 * syscalls fails with EPERM, and creating NETLINK_AUDIT sockets fails with
 * EAFNOSUPPORT. Compiles to a no-op returning 0 when HAVE_SECCOMP is not defined.
 *
 * Returns the result of the last libseccomp call (negative on failure). */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto out;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k], 0);

                /* -EFAULT means the syscall is unknown; that one is simply skipped. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto out;
                }
        }

        /* Audit does not work inside containers and most of the userspace audit
         * hookup fails there, so creation of audit sockets is switched off:
         * socket(AF_NETLINK, *, NETLINK_AUDIT) fails with EAFNOSUPPORT, which audit
         * userspace takes as the sign that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto out;
        }

        /* Loading the filter must not set NO_NEW_PRIVS. */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto out;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

out:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2463
2464 static int setup_propagate(const char *root) {
2465         const char *p, *q;
2466
2467         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2468         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2469         p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2470         (void) mkdir_p(p, 0600);
2471
2472         q = strappenda(root, "/run/systemd/nspawn/incoming");
2473         mkdir_parents(q, 0755);
2474         mkdir_p(q, 0600);
2475
2476         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2477                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2478
2479         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2480                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2481
2482         return 0;
2483 }
2484
2485 static int setup_image(char **device_path, int *loop_nr) {
2486         struct loop_info64 info = {
2487                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2488         };
2489         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2490         _cleanup_free_ char* loopdev = NULL;
2491         struct stat st;
2492         int r, nr;
2493
2494         assert(device_path);
2495         assert(loop_nr);
2496         assert(arg_image);
2497
2498         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2499         if (fd < 0)
2500                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2501
2502         if (fstat(fd, &st) < 0)
2503                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2504
2505         if (S_ISBLK(st.st_mode)) {
2506                 char *p;
2507
2508                 p = strdup(arg_image);
2509                 if (!p)
2510                         return log_oom();
2511
2512                 *device_path = p;
2513
2514                 *loop_nr = -1;
2515
2516                 r = fd;
2517                 fd = -1;
2518
2519                 return r;
2520         }
2521
2522         if (!S_ISREG(st.st_mode)) {
2523                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2524                 return -EINVAL;
2525         }
2526
2527         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2528         if (control < 0)
2529                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2530
2531         nr = ioctl(control, LOOP_CTL_GET_FREE);
2532         if (nr < 0)
2533                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2534
2535         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2536                 return log_oom();
2537
2538         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2539         if (loop < 0)
2540                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2541
2542         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2543                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2544
2545         if (arg_read_only)
2546                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2547
2548         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2549                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2550
2551         *device_path = loopdev;
2552         loopdev = NULL;
2553
2554         *loop_nr = nr;
2555
2556         r = loop;
2557         loop = -1;
2558
2559         return r;
2560 }
2561
2562 static int wait_for_block_device(struct udev *udev, dev_t devnum, struct udev_device **ret) {
2563         _cleanup_udev_monitor_unref_ struct udev_monitor *monitor = NULL;
2564         int r;
2565
2566         assert(udev);
2567         assert(ret);
2568
2569         for (;;) {
2570                 _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2571                 struct pollfd pfd = {
2572                         .events = POLLIN
2573                 };
2574
2575                 d = udev_device_new_from_devnum(udev, 'b', devnum);
2576                 if (!d)
2577                         return log_oom();
2578
2579                 r = udev_device_get_is_initialized(d);
2580                 if (r < 0)
2581                         return log_error_errno(r, "Failed to check if device is initialized: %m");
2582                 if (r > 0) {
2583                         *ret = d;
2584                         d = NULL;
2585                         return 0;
2586                 }
2587                 d = udev_device_unref(d);
2588
2589                 if (!monitor) {
2590                         monitor = udev_monitor_new_from_netlink(udev, "udev");
2591                         if (!monitor)
2592                                 return log_oom();
2593
2594                         r = udev_monitor_filter_add_match_subsystem_devtype(monitor, "block", NULL);
2595                         if (r < 0)
2596                                 return log_error_errno(r, "Failed to add block match: %m");
2597
2598                         r = udev_monitor_enable_receiving(monitor);
2599                         if (r < 0)
2600                                 return log_error_errno(r, "Failed to turn on monitor: %m");
2601
2602                         continue;
2603                 }
2604
2605                 pfd.fd = udev_monitor_get_fd(monitor);
2606                 if (pfd.fd < 0)
2607                         return log_error_errno(r, "Failed to get udev monitor fd: %m");
2608
2609                 r = poll(&pfd, 1, -1);
2610                 if (r < 0)
2611                         return log_error_errno(errno, "Failed to wait for device initialization: %m");
2612
2613                 d = udev_monitor_receive_device(monitor);
2614         }
2615
2616         return 0;
2617 }
2618
/* Explanatory text appended to the error messages emitted when no usable
 * partition layout is found in a disk image. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2625
/* Inspects the partition table of the block device open as `fd` and picks the
 * partitions to mount.
 *
 * GPT: partitions flagged GPT_FLAG_NO_AUTO are ignored. Home, srv, native root and
 * secondary root partitions are recognized by type UUID; if several partitions of
 * the same type exist, the one with the lowest partition number is used.
 * Partitions of type GPT_LINUX_GENERIC are remembered as fallback root.
 * MBR: partitions of type 0x83 whose flags equal 0x80 (bootable) are remembered as
 * fallback root.
 *
 * Root selection order: native root, then secondary root, then the generic
 * partition. The generic fallback is only accepted if exactly one candidate
 * exists.
 *
 * On success returns 0. *root_device, *root_device_rw and *secondary are always
 * set; *home_device/*home_device_rw and *srv_device/*srv_device_rw are only set
 * if such a partition was found. Device strings are handed over to the caller.
 * On failure returns a negative errno-style value. Without HAVE_BLKID the
 * function always fails with -ENOTSUP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;
        bool is_gpt, is_mbr, multiple_generic = false;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        /* Wait until udev has initialized the whole-disk device, then enumerate its
         * children to find the partition device nodes. */
        r = wait_for_block_device(udev, st.st_rdev, &d);
        if (r < 0)
                return r;

        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself. */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                /* Keep the candidate with the lowest partition number. */
                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Must be stored as the generic candidate (not as root),
                                 * otherwise a second bootable Linux partition is never
                                 * detected and silently replaces the first one. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2937
/* Mounts the block device `what` at `where` (with `directory` appended if it is
 * not NULL).
 *
 * The file system type is probed with libblkid. The mount is done with MS_NODEV
 * and read-only unless `rw` is true and arg_read_only is unset. LUKS volumes are
 * rejected with -ENOTSUP.
 *
 * Returns 0 on success, a negative errno-style value on failure. Without
 * HAVE_BLKID the function always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A real probing error
                 * (-1) is reported through the errno path below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3001
3002 static int mount_devices(
3003                 const char *where,
3004                 const char *root_device, bool root_device_rw,
3005                 const char *home_device, bool home_device_rw,
3006                 const char *srv_device, bool srv_device_rw) {
3007         int r;
3008
3009         assert(where);
3010
3011         if (root_device) {
3012                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3013                 if (r < 0)
3014                         return log_error_errno(r, "Failed to mount root directory: %m");
3015         }
3016
3017         if (home_device) {
3018                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3019                 if (r < 0)
3020                         return log_error_errno(r, "Failed to mount home directory: %m");
3021         }
3022
3023         if (srv_device) {
3024                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3025                 if (r < 0)
3026                         return log_error_errno(r, "Failed to mount server data directory: %m");
3027         }
3028
3029         return 0;
3030 }
3031
3032 static void loop_remove(int nr, int *image_fd) {
3033         _cleanup_close_ int control = -1;
3034         int r;
3035
3036         if (nr < 0)
3037                 return;
3038
3039         if (image_fd && *image_fd >= 0) {
3040                 r = ioctl(*image_fd, LOOP_CLR_FD);
3041                 if (r < 0)
3042                         log_debug_errno(errno, "Failed to close loop image: %m");
3043                 *image_fd = safe_close(*image_fd);
3044         }
3045
3046         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3047         if (control < 0) {
3048                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3049                 return;
3050         }
3051
3052         r = ioctl(control, LOOP_CTL_REMOVE, nr);
3053         if (r < 0)
3054                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3055 }
3056
3057 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3058         int pipe_fds[2];
3059         pid_t pid;
3060
3061         assert(database);
3062         assert(key);
3063         assert(rpid);
3064
3065         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3066                 return log_error_errno(errno, "Failed to allocate pipe: %m");
3067
3068         pid = fork();
3069         if (pid < 0)
3070                 return log_error_errno(errno, "Failed to fork getent child: %m");
3071         else if (pid == 0) {
3072                 int nullfd;
3073                 char *empty_env = NULL;
3074
3075                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3076                         _exit(EXIT_FAILURE);
3077
3078                 if (pipe_fds[0] > 2)
3079                         safe_close(pipe_fds[0]);
3080                 if (pipe_fds[1] > 2)
3081                         safe_close(pipe_fds[1]);
3082
3083                 nullfd = open("/dev/null", O_RDWR);
3084                 if (nullfd < 0)
3085                         _exit(EXIT_FAILURE);
3086
3087                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3088                         _exit(EXIT_FAILURE);
3089
3090                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3091                         _exit(EXIT_FAILURE);
3092
3093                 if (nullfd > 2)
3094                         safe_close(nullfd);
3095
3096                 reset_all_signal_handlers();
3097                 close_all_fds(NULL, 0);
3098
3099                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3100                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3101                 _exit(EXIT_FAILURE);
3102         }
3103
3104         pipe_fds[1] = safe_close(pipe_fds[1]);
3105
3106         *rpid = pid;
3107
3108         return pipe_fds[0];
3109 }
3110
3111 static int change_uid_gid(char **_home) {
3112         char line[LINE_MAX], *x, *u, *g, *h;
3113         const char *word, *state;
3114         _cleanup_free_ uid_t *uids = NULL;
3115         _cleanup_free_ char *home = NULL;
3116         _cleanup_fclose_ FILE *f = NULL;
3117         _cleanup_close_ int fd = -1;
3118         unsigned n_uids = 0;
3119         size_t sz = 0, l;
3120         uid_t uid;
3121         gid_t gid;
3122         pid_t pid;
3123         int r;
3124
3125         assert(_home);
3126
3127         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3128                 /* Reset everything fully to 0, just in case */
3129
3130                 if (setgroups(0, NULL) < 0)
3131                         return log_error_errno(errno, "setgroups() failed: %m");
3132
3133                 if (setresgid(0, 0, 0) < 0)
3134                         return log_error_errno(errno, "setregid() failed: %m");
3135
3136                 if (setresuid(0, 0, 0) < 0)
3137                         return log_error_errno(errno, "setreuid() failed: %m");
3138
3139                 *_home = NULL;
3140                 return 0;
3141         }
3142
3143         /* First, get user credentials */
3144         fd = spawn_getent("passwd", arg_user, &pid);
3145         if (fd < 0)
3146                 return fd;
3147
3148         f = fdopen(fd, "r");
3149         if (!f)
3150                 return log_oom();
3151         fd = -1;
3152
3153         if (!fgets(line, sizeof(line), f)) {
3154
3155                 if (!ferror(f)) {
3156                         log_error("Failed to resolve user %s.", arg_user);
3157                         return -ESRCH;
3158                 }
3159
3160                 log_error_errno(errno, "Failed to read from getent: %m");
3161                 return -errno;
3162         }
3163
3164         truncate_nl(line);
3165
3166         wait_for_terminate_and_warn("getent passwd", pid, true);
3167
3168         x = strchr(line, ':');
3169         if (!x) {
3170                 log_error("/etc/passwd entry has invalid user field.");
3171                 return -EIO;
3172         }
3173
3174         u = strchr(x+1, ':');
3175         if (!u) {
3176                 log_error("/etc/passwd entry has invalid password field.");
3177                 return -EIO;
3178         }
3179
3180         u++;
3181         g = strchr(u, ':');
3182         if (!g) {
3183                 log_error("/etc/passwd entry has invalid UID field.");
3184                 return -EIO;
3185         }
3186
3187         *g = 0;
3188         g++;
3189         x = strchr(g, ':');
3190         if (!x) {
3191                 log_error("/etc/passwd entry has invalid GID field.");
3192                 return -EIO;
3193         }
3194
3195         *x = 0;
3196         h = strchr(x+1, ':');
3197         if (!h) {
3198                 log_error("/etc/passwd entry has invalid GECOS field.");
3199                 return -EIO;
3200         }
3201
3202         h++;
3203         x = strchr(h, ':');
3204         if (!x) {
3205                 log_error("/etc/passwd entry has invalid home directory field.");
3206                 return -EIO;
3207         }
3208
3209         *x = 0;
3210
3211         r = parse_uid(u, &uid);
3212         if (r < 0) {
3213                 log_error("Failed to parse UID of user.");
3214                 return -EIO;
3215         }
3216
3217         r = parse_gid(g, &gid);
3218         if (r < 0) {
3219                 log_error("Failed to parse GID of user.");
3220                 return -EIO;
3221         }
3222
3223         home = strdup(h);
3224         if (!home)
3225                 return log_oom();
3226
3227         /* Second, get group memberships */
3228         fd = spawn_getent("initgroups", arg_user, &pid);
3229         if (fd < 0)
3230                 return fd;
3231
3232         fclose(f);
3233         f = fdopen(fd, "r");
3234         if (!f)
3235                 return log_oom();
3236         fd = -1;
3237
3238         if (!fgets(line, sizeof(line), f)) {
3239                 if (!ferror(f)) {
3240                         log_error("Failed to resolve user %s.", arg_user);
3241                         return -ESRCH;
3242                 }
3243
3244                 log_error_errno(errno, "Failed to read from getent: %m");
3245                 return -errno;
3246         }
3247
3248         truncate_nl(line);
3249
3250         wait_for_terminate_and_warn("getent initgroups", pid, true);
3251
3252         /* Skip over the username and subsequent separator whitespace */
3253         x = line;
3254         x += strcspn(x, WHITESPACE);
3255         x += strspn(x, WHITESPACE);
3256
3257         FOREACH_WORD(word, l, x, state) {
3258                 char c[l+1];
3259
3260                 memcpy(c, word, l);
3261                 c[l] = 0;
3262
3263                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3264                         return log_oom();
3265
3266                 r = parse_uid(c, &uids[n_uids++]);
3267                 if (r < 0) {
3268                         log_error("Failed to parse group data from getent.");
3269                         return -EIO;
3270                 }
3271         }
3272
3273         r = mkdir_parents(home, 0775);
3274         if (r < 0)
3275                 return log_error_errno(r, "Failed to make home root directory: %m");
3276
3277         r = mkdir_safe(home, 0755, uid, gid);
3278         if (r < 0 && r != -EEXIST)
3279                 return log_error_errno(r, "Failed to make home directory: %m");
3280
3281         fchown(STDIN_FILENO, uid, gid);
3282         fchown(STDOUT_FILENO, uid, gid);
3283         fchown(STDERR_FILENO, uid, gid);
3284
3285         if (setgroups(n_uids, uids) < 0)
3286                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3287
3288         if (setresgid(gid, gid, gid) < 0)
3289                 return log_error_errno(errno, "setregid() failed: %m");
3290
3291         if (setresuid(uid, uid, uid) < 0)
3292                 return log_error_errno(errno, "setreuid() failed: %m");
3293
3294         if (_home) {
3295                 *_home = home;
3296                 home = NULL;
3297         }
3298
3299         return 0;
3300 }
3301
3302 /*
3303  * Return values:
3304  * < 0 : wait_for_terminate() failed to get the state of the
3305  *       container, the container was terminated by a signal, or
3306  *       failed for an unknown reason.  No change is made to the
3307  *       container argument.
3308  * > 0 : The program executed in the container terminated with an
3309  *       error.  The exit code of the program executed in the
3310  *       container is returned.  The container argument has been set
3311  *       to CONTAINER_TERMINATED.
3312  *   0 : The container is being rebooted, has been shut down or exited
3313  *       successfully.  The container argument has been set to either
3314  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3315  *
3316  * That is, success is indicated by a return value of zero, and an
3317  * error is indicated by a non-zero value.
3318  */
3319 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3320         siginfo_t status;
3321         int r;
3322
3323         r = wait_for_terminate(pid, &status);
3324         if (r < 0)
3325                 return log_warning_errno(r, "Failed to wait for container: %m");
3326
3327         switch (status.si_code) {
3328
3329         case CLD_EXITED:
3330                 if (status.si_status == 0) {
3331                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3332
3333                 } else
3334                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3335
3336                 *container = CONTAINER_TERMINATED;
3337                 return status.si_status;
3338
3339         case CLD_KILLED:
3340                 if (status.si_status == SIGINT) {
3341
3342                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3343                         *container = CONTAINER_TERMINATED;
3344                         return 0;
3345
3346                 } else if (status.si_status == SIGHUP) {
3347
3348                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3349                         *container = CONTAINER_REBOOTED;
3350                         return 0;
3351                 }
3352
3353                 /* CLD_KILLED fallthrough */
3354
3355         case CLD_DUMPED:
3356                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3357                 return -EIO;
3358
3359         default:
3360                 log_error("Container %s failed due to unknown reason.", arg_machine);
3361                 return -EIO;
3362         }
3363
3364         return r;
3365 }
3366
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
}
3368
3369 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3370         pid_t pid;
3371
3372         pid = PTR_TO_UINT32(userdata);
3373         if (pid > 0) {
3374                 if (kill(pid, SIGRTMIN+3) >= 0) {
3375                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3376                         sd_event_source_set_userdata(s, NULL);
3377                         return 0;
3378                 }
3379         }
3380
3381         sd_event_exit(sd_event_source_get_event(s), 0);
3382         return 0;
3383 }
3384
3385 static int determine_names(void) {
3386         int r;
3387
3388         if (!arg_image && !arg_directory) {
3389                 if (arg_machine) {
3390                         _cleanup_(image_unrefp) Image *i = NULL;
3391
3392                         r = image_find(arg_machine, &i);
3393                         if (r < 0)
3394                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3395                         else if (r == 0) {
3396                                 log_error("No image for machine '%s': %m", arg_machine);
3397                                 return -ENOENT;
3398                         }
3399
3400                         if (i->type == IMAGE_RAW)
3401                                 r = set_sanitized_path(&arg_image, i->path);
3402                         else
3403                                 r = set_sanitized_path(&arg_directory, i->path);
3404                         if (r < 0)
3405                                 return log_error_errno(r, "Invalid image directory: %m");
3406
3407                         arg_read_only = arg_read_only || i->read_only;
3408                 } else
3409                         arg_directory = get_current_dir_name();
3410
3411                 if (!arg_directory && !arg_machine) {
3412                         log_error("Failed to determine path, please use -D or -i.");
3413                         return -EINVAL;
3414                 }
3415         }
3416
3417         if (!arg_machine) {
3418                 if (arg_directory && path_equal(arg_directory, "/"))
3419                         arg_machine = gethostname_malloc();
3420                 else
3421                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3422
3423                 if (!arg_machine)
3424                         return log_oom();
3425
3426                 hostname_cleanup(arg_machine, false);
3427                 if (!machine_name_is_valid(arg_machine)) {
3428                         log_error("Failed to determine machine name automatically, please use -M.");
3429                         return -EINVAL;
3430                 }
3431
3432                 if (arg_ephemeral) {
3433                         char *b;
3434
3435                         /* Add a random suffix when this is an
3436                          * ephemeral machine, so that we can run many
3437                          * instances at once without manually having
3438                          * to specify -M each time. */
3439
3440                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3441                                 return log_oom();
3442
3443                         free(arg_machine);
3444                         arg_machine = b;
3445                 }
3446         }
3447
3448         return 0;
3449 }
3450
3451 int main(int argc, char *argv[]) {
3452
3453         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3454         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3455         _cleanup_close_ int master = -1, image_fd = -1;
3456         _cleanup_fdset_free_ FDSet *fds = NULL;
3457         int r, n_fd_passed, loop_nr = -1;
3458         char veth_name[IFNAMSIZ];
3459         bool secondary = false, remove_subvol = false;
3460         sigset_t mask, mask_chld;
3461         pid_t pid = 0;
3462         int ret = EXIT_SUCCESS;
3463         union in_addr_union exposed = {};
3464         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3465
3466         log_parse_environment();
3467         log_open();
3468
3469         r = parse_argv(argc, argv);
3470         if (r <= 0)
3471                 goto finish;
3472
3473         r = determine_names();
3474         if (r < 0)
3475                 goto finish;
3476
3477         if (geteuid() != 0) {
3478                 log_error("Need to be root.");
3479                 r = -EPERM;
3480                 goto finish;
3481         }
3482
3483         if (sd_booted() <= 0) {
3484                 log_error("Not running on a systemd system.");
3485                 r = -EINVAL;
3486                 goto finish;
3487         }
3488
3489         log_close();
3490         n_fd_passed = sd_listen_fds(false);
3491         if (n_fd_passed > 0) {
3492                 r = fdset_new_listen_fds(&fds, false);
3493                 if (r < 0) {
3494                         log_error_errno(r, "Failed to collect file descriptors: %m");
3495                         goto finish;
3496                 }
3497         }
3498         fdset_close_others(fds);
3499         log_open();
3500
3501         if (arg_directory) {
3502                 assert(!arg_image);
3503
3504                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3505                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3506                         r = -EINVAL;
3507                         goto finish;
3508                 }
3509
3510                 if (arg_ephemeral) {
3511                         _cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
3512                         char *np;
3513
3514                         /* If the specified path is a mount point we
3515                          * generate the new snapshot immediately
3516                          * inside it under a random name. However if
3517                          * the specified is not a mount point we
3518                          * create the new snapshot in the parent
3519                          * directory, just next to it. */
3520                         r = path_is_mount_point(arg_directory, false);
3521                         if (r < 0) {
3522                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3523                                 goto finish;
3524                         }
3525                         if (r > 0)
3526                                 r = tempfn_random_child(arg_directory, &np);
3527                         else
3528                                 r = tempfn_random(arg_directory, &np);
3529                         if (r < 0) {
3530                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3531                                 goto finish;
3532                         }
3533
3534                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3535                         if (r < 0) {
3536                                 log_error_errno(r, "Failed to lock %s: %m", np);
3537                                 goto finish;
3538                         }
3539
3540                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3541                         if (r < 0) {
3542                                 free(np);
3543                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3544                                 goto finish;
3545                         }
3546
3547                         free(arg_directory);
3548                         arg_directory = np;
3549
3550                         remove_subvol = true;
3551
3552                 } else {
3553                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3554                         if (r == -EBUSY) {
3555                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3556                                 goto finish;
3557                         }
3558                         if (r < 0) {
3559                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3560                                 return r;
3561                         }
3562
3563                         if (arg_template) {
3564                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3565                                 if (r == -EEXIST) {
3566                                         if (!arg_quiet)
3567                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3568                                 } else if (r < 0) {
3569                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3570                                         goto finish;
3571                                 } else {
3572                                         if (!arg_quiet)
3573                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3574                                 }
3575                         }
3576                 }
3577
3578                 if (arg_boot) {
3579                         if (path_is_os_tree(arg_directory) <= 0) {
3580                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3581                                 r = -EINVAL;
3582                                 goto finish;
3583                         }
3584                 } else {
3585                         const char *p;
3586
3587                         p = strappenda(arg_directory,
3588                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3589                         if (access(p, F_OK) < 0) {
3590                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3591                                 r = -EINVAL;
3592                                 goto finish;
3593                         }
3594                 }
3595
3596         } else {
3597                 char template[] = "/tmp/nspawn-root-XXXXXX";
3598
3599                 assert(arg_image);
3600                 assert(!arg_template);
3601
3602                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3603                 if (r == -EBUSY) {
3604                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3605                         goto finish;
3606                 }
3607                 if (r < 0) {
3608                         r = log_error_errno(r, "Failed to create image lock: %m");
3609                         goto finish;
3610                 }
3611
3612                 if (!mkdtemp(template)) {
3613                         log_error_errno(errno, "Failed to create temporary directory: %m");
3614                         r = -errno;
3615                         goto finish;
3616                 }
3617
3618                 arg_directory = strdup(template);
3619                 if (!arg_directory) {
3620                         r = log_oom();
3621                         goto finish;
3622                 }
3623
3624                 image_fd = setup_image(&device_path, &loop_nr);
3625                 if (image_fd < 0) {
3626                         r = image_fd;
3627                         goto finish;
3628                 }
3629
3630                 r = dissect_image(image_fd,
3631                                   &root_device, &root_device_rw,
3632                                   &home_device, &home_device_rw,
3633                                   &srv_device, &srv_device_rw,
3634                                   &secondary);
3635                 if (r < 0)
3636                         goto finish;
3637         }
3638
3639         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3640         if (master < 0) {
3641                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3642                 goto finish;
3643         }
3644
3645         r = ptsname_malloc(master, &console);
3646         if (r < 0) {
3647                 r = log_error_errno(r, "Failed to determine tty name: %m");
3648                 goto finish;
3649         }
3650
3651         if (!arg_quiet)
3652                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3653                          arg_machine, arg_image ?: arg_directory);
3654
3655         if (unlockpt(master) < 0) {
3656                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3657                 goto finish;
3658         }
3659
3660         assert_se(sigemptyset(&mask) == 0);
3661         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3662         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3663
3664         assert_se(sigemptyset(&mask_chld) == 0);
3665         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3666
3667         for (;;) {
3668                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3669                 ContainerStatus container_status;
3670                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3671                 struct sigaction sa = {
3672                         .sa_handler = nop_handler,
3673                         .sa_flags = SA_NOCLDSTOP,
3674                 };
3675
3676                 r = barrier_create(&barrier);
3677                 if (r < 0) {
3678                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3679                         goto finish;
3680                 }
3681
3682                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3683                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3684                         goto finish;
3685                 }
3686
3687                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3688                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3689                         goto finish;
3690                 }
3691
3692                 /* Child can be killed before execv(), so handle SIGCHLD
3693                  * in order to interrupt parent's blocking calls and
3694                  * give it a chance to call wait() and terminate. */
3695                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3696                 if (r < 0) {
3697                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3698                         goto finish;
3699                 }
3700
3701                 r = sigaction(SIGCHLD, &sa, NULL);
3702                 if (r < 0) {
3703                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3704                         goto finish;
3705                 }
3706
3707                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3708                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3709                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3710                 if (pid < 0) {
3711                         if (errno == EINVAL)
3712                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3713                         else
3714                                 r = log_error_errno(errno, "clone() failed: %m");
3715
3716                         goto finish;
3717                 }
3718
3719                 if (pid == 0) {
3720                         /* child */
3721                         _cleanup_free_ char *home = NULL;
3722                         unsigned n_env = 2;
3723                         const char *envp[] = {
3724                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3725                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3726                                 NULL, /* TERM */
3727                                 NULL, /* HOME */
3728                                 NULL, /* USER */
3729                                 NULL, /* LOGNAME */
3730                                 NULL, /* container_uuid */
3731                                 NULL, /* LISTEN_FDS */
3732                                 NULL, /* LISTEN_PID */
3733                                 NULL
3734                         };
3735                         char **env_use;
3736
3737                         barrier_set_role(&barrier, BARRIER_CHILD);
3738
3739                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3740                         if (envp[n_env])
3741                                 n_env ++;
3742
3743                         master = safe_close(master);
3744
3745                         close_nointr(STDIN_FILENO);
3746                         close_nointr(STDOUT_FILENO);
3747                         close_nointr(STDERR_FILENO);
3748
3749                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3750                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3751
3752                         reset_all_signal_handlers();
3753                         reset_signal_mask();
3754
3755                         r = open_terminal(console, O_RDWR);
3756                         if (r != STDIN_FILENO) {
3757                                 if (r >= 0) {
3758                                         safe_close(r);
3759                                         r = -EINVAL;
3760                                 }
3761
3762                                 log_error_errno(r, "Failed to open console: %m");
3763                                 _exit(EXIT_FAILURE);
3764                         }
3765
3766                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3767                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3768                                 log_error_errno(errno, "Failed to duplicate console: %m");
3769                                 _exit(EXIT_FAILURE);
3770                         }
3771
3772                         if (setsid() < 0) {
3773                                 log_error_errno(errno, "setsid() failed: %m");
3774                                 _exit(EXIT_FAILURE);
3775                         }
3776
3777                         if (reset_audit_loginuid() < 0)
3778                                 _exit(EXIT_FAILURE);
3779
3780                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3781                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3782                                 _exit(EXIT_FAILURE);
3783                         }
3784
3785                         /* Mark everything as slave, so that we still
3786                          * receive mounts from the real root, but don't
3787                          * propagate mounts to the real root. */
3788                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3789                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3790                                 _exit(EXIT_FAILURE);
3791                         }
3792
3793                         if (mount_devices(arg_directory,
3794                                           root_device, root_device_rw,
3795                                           home_device, home_device_rw,
3796                                           srv_device, srv_device_rw) < 0)
3797                                 _exit(EXIT_FAILURE);
3798
3799                         /* Turn directory into bind mount */
3800                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3801                                 log_error_errno(errno, "Failed to make bind mount: %m");
3802                                 _exit(EXIT_FAILURE);
3803                         }
3804
3805                         r = setup_volatile(arg_directory);
3806                         if (r < 0)
3807                                 _exit(EXIT_FAILURE);
3808
3809                         if (setup_volatile_state(arg_directory) < 0)
3810                                 _exit(EXIT_FAILURE);
3811
3812                         r = base_filesystem_create(arg_directory);
3813                         if (r < 0)
3814                                 _exit(EXIT_FAILURE);
3815
3816                         if (arg_read_only) {
3817                                 r = bind_remount_recursive(arg_directory, true);
3818                                 if (r < 0) {
3819                                         log_error_errno(r, "Failed to make tree read-only: %m");
3820                                         _exit(EXIT_FAILURE);
3821                                 }
3822                         }
3823
3824                         if (mount_all(arg_directory) < 0)
3825                                 _exit(EXIT_FAILURE);
3826
3827                         if (copy_devnodes(arg_directory) < 0)
3828                                 _exit(EXIT_FAILURE);
3829
3830                         if (setup_ptmx(arg_directory) < 0)
3831                                 _exit(EXIT_FAILURE);
3832
3833                         dev_setup(arg_directory);
3834
3835                         if (setup_propagate(arg_directory) < 0)
3836                                 _exit(EXIT_FAILURE);
3837
3838                         if (setup_seccomp() < 0)
3839                                 _exit(EXIT_FAILURE);
3840
3841                         if (setup_dev_console(arg_directory, console) < 0)
3842                                 _exit(EXIT_FAILURE);
3843
3844                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3845                                 _exit(EXIT_FAILURE);
3846                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3847
3848                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
3849                                 _exit(EXIT_FAILURE);
3850                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3851
3852                         /* Tell the parent that we are ready, and that
3853                          * it can cgroupify us to that we lack access
3854                          * to certain devices and resources. */
3855                         (void) barrier_place(&barrier);
3856
3857                         if (setup_boot_id(arg_directory) < 0)
3858                                 _exit(EXIT_FAILURE);
3859
3860                         if (setup_timezone(arg_directory) < 0)
3861                                 _exit(EXIT_FAILURE);
3862
3863                         if (setup_resolv_conf(arg_directory) < 0)
3864                                 _exit(EXIT_FAILURE);
3865
3866                         if (setup_journal(arg_directory) < 0)
3867                                 _exit(EXIT_FAILURE);
3868
3869                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3870                                 _exit(EXIT_FAILURE);
3871
3872                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3873                                 _exit(EXIT_FAILURE);
3874
3875                         if (mount_tmpfs(arg_directory) < 0)
3876                                 _exit(EXIT_FAILURE);
3877
3878                         /* Wait until we are cgroup-ified, so that we
3879                          * can mount the right cgroup path writable */
3880                         (void) barrier_sync_next(&barrier);
3881
3882                         if (mount_cgroup(arg_directory) < 0)
3883                                 _exit(EXIT_FAILURE);
3884
3885                         if (chdir(arg_directory) < 0) {
3886                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3887                                 _exit(EXIT_FAILURE);
3888                         }
3889
3890                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3891                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3892                                 _exit(EXIT_FAILURE);
3893                         }
3894
3895                         if (chroot(".") < 0) {
3896                                 log_error_errno(errno, "chroot() failed: %m");
3897                                 _exit(EXIT_FAILURE);
3898                         }
3899
3900                         if (chdir("/") < 0) {
3901                                 log_error_errno(errno, "chdir() failed: %m");
3902                                 _exit(EXIT_FAILURE);
3903                         }
3904
3905                         umask(0022);
3906
3907                         if (arg_private_network)
3908                                 loopback_setup();
3909
3910                         if (drop_capabilities() < 0) {
3911                                 log_error_errno(errno, "drop_capabilities() failed: %m");
3912                                 _exit(EXIT_FAILURE);
3913                         }
3914
3915                         r = change_uid_gid(&home);
3916                         if (r < 0)
3917                                 _exit(EXIT_FAILURE);
3918
3919                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3920                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3921                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3922                                 log_oom();
3923                                 _exit(EXIT_FAILURE);
3924                         }
3925
3926                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3927                                 char as_uuid[37];
3928
3929                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3930                                         log_oom();
3931                                         _exit(EXIT_FAILURE);
3932                                 }
3933                         }
3934
3935                         if (fdset_size(fds) > 0) {
3936                                 r = fdset_cloexec(fds, false);
3937                                 if (r < 0) {
3938                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3939                                         _exit(EXIT_FAILURE);
3940                                 }
3941
3942                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3943                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3944                                         log_oom();
3945                                         _exit(EXIT_FAILURE);
3946                                 }
3947                         }
3948
3949                         setup_hostname();
3950
3951                         if (arg_personality != 0xffffffffLU) {
3952                                 if (personality(arg_personality) < 0) {
3953                                         log_error_errno(errno, "personality() failed: %m");
3954                                         _exit(EXIT_FAILURE);
3955                                 }
3956                         } else if (secondary) {
3957                                 if (personality(PER_LINUX32) < 0) {
3958                                         log_error_errno(errno, "personality() failed: %m");
3959                                         _exit(EXIT_FAILURE);
3960                                 }
3961                         }
3962
3963 #ifdef HAVE_SELINUX
3964                         if (arg_selinux_context)
3965                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3966                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3967                                         _exit(EXIT_FAILURE);
3968                                 }
3969 #endif
3970
3971                         if (!strv_isempty(arg_setenv)) {
3972                                 char **n;
3973
3974                                 n = strv_env_merge(2, envp, arg_setenv);
3975                                 if (!n) {
3976                                         log_oom();
3977                                         _exit(EXIT_FAILURE);
3978                                 }
3979
3980                                 env_use = n;
3981                         } else
3982                                 env_use = (char**) envp;
3983
3984                         /* Wait until the parent is ready with the setup, too... */
3985                         if (!barrier_place_and_sync(&barrier))
3986                                 _exit(EXIT_FAILURE);
3987
3988                         if (arg_boot) {
3989                                 char **a;
3990                                 size_t l;
3991
3992                                 /* Automatically search for the init system */
3993
3994                                 l = 1 + argc - optind;
3995                                 a = newa(char*, l + 1);
3996                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3997
3998                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3999                                 execve(a[0], a, env_use);
4000
4001                                 a[0] = (char*) "/lib/systemd/systemd";
4002                                 execve(a[0], a, env_use);
4003
4004                                 a[0] = (char*) "/sbin/init";
4005                                 execve(a[0], a, env_use);
4006                         } else if (argc > optind)
4007                                 execvpe(argv[optind], argv + optind, env_use);
4008                         else {
4009                                 chdir(home ? home : "/root");
4010                                 execle("/bin/bash", "-bash", NULL, env_use);
4011                                 execle("/bin/sh", "-sh", NULL, env_use);
4012                         }
4013
4014                         log_error_errno(errno, "execv() failed: %m");
4015                         _exit(EXIT_FAILURE);
4016                 }
4017
4018                 barrier_set_role(&barrier, BARRIER_PARENT);
4019                 fdset_free(fds);
4020                 fds = NULL;
4021
4022                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4023                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4024
4025                 /* Wait for the most basic Child-setup to be done,
4026                  * before we add hardware to it, and place it in a
4027                  * cgroup. */
4028                 if (barrier_sync_next(&barrier)) {
4029                         int ifi = 0;
4030
4031                         r = move_network_interfaces(pid);
4032                         if (r < 0)
4033                                 goto finish;
4034
4035                         r = setup_veth(pid, veth_name, &ifi);
4036                         if (r < 0)
4037                                 goto finish;
4038
4039                         r = setup_bridge(veth_name, &ifi);
4040                         if (r < 0)
4041                                 goto finish;
4042
4043                         r = setup_macvlan(pid);
4044                         if (r < 0)
4045                                 goto finish;
4046
4047                         r = register_machine(pid, ifi);
4048                         if (r < 0)
4049                                 goto finish;
4050
4051                         /* Block SIGCHLD here, before notifying child.
4052                          * process_pty() will handle it with the other signals. */
4053                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4054                         if (r < 0)
4055                                 goto finish;
4056
4057                         /* Reset signal to default */
4058                         r = default_signals(SIGCHLD, -1);
4059                         if (r < 0)
4060                                 goto finish;
4061
4062                         /* Notify the child that the parent is ready with all
4063                          * its setup, and that the child can now hand over
4064                          * control to the code to run inside the container. */
4065                         (void) barrier_place(&barrier);
4066
4067                         /* And wait until the child is completely ready. */
4068                         if (barrier_place_and_sync(&barrier)) {
4069                                 _cleanup_event_unref_ sd_event *event = NULL;
4070                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4071                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4072                                 char last_char = 0;
4073
4074                                 sd_notifyf(false,
4075                                            "READY=1\n"
4076                                            "STATUS=Container running.\n"
4077                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4078
4079                                 r = sd_event_new(&event);
4080                                 if (r < 0) {
4081                                         log_error_errno(r, "Failed to get default event source: %m");
4082                                         goto finish;
4083                                 }
4084
4085                                 if (arg_boot) {
4086                                         /* Try to kill the init system on SIGINT or SIGTERM */
4087                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4088                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4089                                 } else {
4090                                         /* Immediately exit */
4091                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4092                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4093                                 }
4094
4095                                 /* simply exit on sigchld */
4096                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4097
4098                                 if (arg_expose_ports) {
4099                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4100                                         if (r < 0)
4101                                                 goto finish;
4102
4103                                         (void) expose_ports(rtnl, &exposed);
4104                                 }
4105
4106                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4107
4108                                 r = pty_forward_new(event, master, true, &forward);
4109                                 if (r < 0) {
4110                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4111                                         goto finish;
4112                                 }
4113
4114                                 r = sd_event_loop(event);
4115                                 if (r < 0) {
4116                                         log_error_errno(r, "Failed to run event loop: %m");
4117                                         goto finish;
4118                                 }
4119
4120                                 pty_forward_get_last_char(forward, &last_char);
4121
4122                                 forward = pty_forward_free(forward);
4123
4124                                 if (!arg_quiet && last_char != '\n')
4125                                         putc('\n', stdout);
4126
4127                                 /* Kill if it is not dead yet anyway */
4128                                 terminate_machine(pid);
4129                         }
4130                 }
4131
4132                 /* Normally redundant, but better safe than sorry */
4133                 kill(pid, SIGKILL);
4134
4135                 r = wait_for_container(pid, &container_status);
4136                 pid = 0;
4137
4138                 if (r < 0)
4139                         /* We failed to wait for the container, or the
4140                          * container exited abnormally */
4141                         goto finish;
4142                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4143                         /* The container exited with a non-zero
4144                          * status, or with zero status and no reboot
4145                          * was requested. */
4146                         ret = r;
4147                         break;
4148                 }
4149
4150                 /* CONTAINER_REBOOTED, loop again */
4151
4152                 if (arg_keep_unit) {
4153                         /* Special handling if we are running as a
4154                          * service: instead of simply restarting the
4155                          * machine we want to restart the entire
4156                          * service, so let's inform systemd about this
4157                          * with the special exit code 133. The service
4158                          * file uses RestartForceExitStatus=133 so
4159                          * that this results in a full nspawn
4160                          * restart. This is necessary since we might
4161                          * have cgroup parameters set we want to have
4162                          * flushed out. */
4163                         ret = 133;
4164                         r = 0;
4165                         break;
4166                 }
4167
4168                 flush_ports(&exposed);
4169         }
4170
4171 finish:
4172         sd_notify(false,
4173                   "STOPPING=1\n"
4174                   "STATUS=Terminating...");
4175
4176         loop_remove(loop_nr, &image_fd);
4177
4178         if (pid > 0)
4179                 kill(pid, SIGKILL);
4180
4181         if (remove_subvol && arg_directory) {
4182                 int k;
4183
4184                 k = btrfs_subvol_remove(arg_directory);
4185                 if (k < 0)
4186                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4187         }
4188
4189         if (arg_machine) {
4190                 const char *p;
4191
4192                 p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
4193                 (void) rm_rf(p, false, true, false);
4194         }
4195
4196         free(arg_directory);
4197         free(arg_template);
4198         free(arg_image);
4199         free(arg_machine);
4200         free(arg_user);
4201         strv_free(arg_setenv);
4202         strv_free(arg_network_interfaces);
4203         strv_free(arg_network_macvlan);
4204         strv_free(arg_bind);
4205         strv_free(arg_bind_ro);
4206         strv_free(arg_tmpfs);
4207
4208         flush_ports(&exposed);
4209
4210         while (arg_expose_ports) {
4211                 ExposePort *p = arg_expose_ports;
4212                 LIST_REMOVE(ports, arg_expose_ports, p);
4213                 free(p);
4214         }
4215
4216         return r < 0 ? EXIT_FAILURE : ret;
4217 }