chiark / gitweb /
2b1feb6aa3776980909da52bb4ef92bb46b6e168
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46
47 #ifdef HAVE_SELINUX
48 #include <selinux/selinux.h>
49 #endif
50
51 #ifdef HAVE_SECCOMP
52 #include <seccomp.h>
53 #endif
54
55 #ifdef HAVE_BLKID
56 #include <blkid/blkid.h>
57 #endif
58
59 #include "sd-daemon.h"
60 #include "sd-bus.h"
61 #include "sd-id128.h"
62 #include "sd-rtnl.h"
63 #include "log.h"
64 #include "util.h"
65 #include "mkdir.h"
66 #include "macro.h"
67 #include "audit.h"
68 #include "missing.h"
69 #include "cgroup-util.h"
70 #include "strv.h"
71 #include "path-util.h"
72 #include "loopback-setup.h"
73 #include "dev-setup.h"
74 #include "fdset.h"
75 #include "build.h"
76 #include "fileio.h"
77 #include "bus-util.h"
78 #include "bus-error.h"
79 #include "ptyfwd.h"
80 #include "bus-kernel.h"
81 #include "env-util.h"
82 #include "def.h"
83 #include "rtnl-util.h"
84 #include "udev-util.h"
85 #include "blkid-util.h"
86 #include "gpt.h"
87 #include "siphash24.h"
88 #include "copy.h"
89 #include "base-filesystem.h"
90 #include "barrier.h"
91 #include "event-util.h"
92 #include "capability.h"
93 #include "cap-list.h"
94 #include "btrfs-util.h"
95 #include "machine-image.h"
96 #include "list.h"
97 #include "in-addr-util.h"
98 #include "fw-util.h"
99 #include "local-addresses.h"
100
101 #ifdef HAVE_SECCOMP
102 #include "seccomp-util.h"
103 #endif
104
105 typedef struct ExposePort {
106         int protocol;
107         uint16_t host_port;
108         uint16_t container_port;
109         LIST_FIELDS(struct ExposePort, ports);
110 } ExposePort;
111
/* How a container run ended. The consumers of this type are outside this
 * part of the file; judging by the names, it distinguishes a plain
 * termination from a reboot request. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
116
/* Journal linking mode selected with --link-journal= (or -j). The "try-"
 * variants of the option are tracked separately in arg_link_journal_try. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
123
/* Volatile mode selected with --volatile[=MODE]: a boolean argument maps to
 * VOLATILE_YES / VOLATILE_NO, the literal "state" maps to VOLATILE_STATE. */
typedef enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2,
} Volatile;
129
130 static char *arg_directory = NULL;
131 static char *arg_template = NULL;
132 static char *arg_user = NULL;
133 static sd_id128_t arg_uuid = {};
134 static char *arg_machine = NULL;
135 static const char *arg_selinux_context = NULL;
136 static const char *arg_selinux_apifs_context = NULL;
137 static const char *arg_slice = NULL;
138 static bool arg_private_network = false;
139 static bool arg_read_only = false;
140 static bool arg_boot = false;
141 static bool arg_ephemeral = false;
142 static LinkJournal arg_link_journal = LINK_AUTO;
143 static bool arg_link_journal_try = false;
144 static uint64_t arg_retain =
145         (1ULL << CAP_CHOWN) |
146         (1ULL << CAP_DAC_OVERRIDE) |
147         (1ULL << CAP_DAC_READ_SEARCH) |
148         (1ULL << CAP_FOWNER) |
149         (1ULL << CAP_FSETID) |
150         (1ULL << CAP_IPC_OWNER) |
151         (1ULL << CAP_KILL) |
152         (1ULL << CAP_LEASE) |
153         (1ULL << CAP_LINUX_IMMUTABLE) |
154         (1ULL << CAP_NET_BIND_SERVICE) |
155         (1ULL << CAP_NET_BROADCAST) |
156         (1ULL << CAP_NET_RAW) |
157         (1ULL << CAP_SETGID) |
158         (1ULL << CAP_SETFCAP) |
159         (1ULL << CAP_SETPCAP) |
160         (1ULL << CAP_SETUID) |
161         (1ULL << CAP_SYS_ADMIN) |
162         (1ULL << CAP_SYS_CHROOT) |
163         (1ULL << CAP_SYS_NICE) |
164         (1ULL << CAP_SYS_PTRACE) |
165         (1ULL << CAP_SYS_TTY_CONFIG) |
166         (1ULL << CAP_SYS_RESOURCE) |
167         (1ULL << CAP_SYS_BOOT) |
168         (1ULL << CAP_AUDIT_WRITE) |
169         (1ULL << CAP_AUDIT_CONTROL) |
170         (1ULL << CAP_MKNOD);
171 static char **arg_bind = NULL;
172 static char **arg_bind_ro = NULL;
173 static char **arg_tmpfs = NULL;
174 static char **arg_setenv = NULL;
175 static bool arg_quiet = false;
176 static bool arg_share_system = false;
177 static bool arg_register = true;
178 static bool arg_keep_unit = false;
179 static char **arg_network_interfaces = NULL;
180 static char **arg_network_macvlan = NULL;
181 static bool arg_network_veth = false;
182 static const char *arg_network_bridge = NULL;
183 static unsigned long arg_personality = 0xffffffffLU;
184 static char *arg_image = NULL;
185 static Volatile arg_volatile = VOLATILE_NO;
186 static ExposePort *arg_expose_ports = NULL;
187
188 static void help(void) {
189         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
190                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
191                "  -h --help                 Show this help\n"
192                "     --version              Print version string\n"
193                "  -q --quiet                Do not show status information\n"
194                "  -D --directory=PATH       Root directory for the container\n"
195                "     --template=PATH        Initialize root directory from template directory,\n"
196                "                            if missing\n"
197                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
198                "                            remove it after exit\n"
199                "  -i --image=PATH           File system device or disk image for the container\n"
200                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
201                "  -u --user=USER            Run the command under specified user or uid\n"
202                "  -M --machine=NAME         Set the machine name for the container\n"
203                "     --uuid=UUID            Set a specific machine UUID for the container\n"
204                "  -S --slice=SLICE          Place the container in the specified slice\n"
205                "     --private-network      Disable network in container\n"
206                "     --network-interface=INTERFACE\n"
207                "                            Assign an existing network interface to the\n"
208                "                            container\n"
209                "     --network-macvlan=INTERFACE\n"
210                "                            Create a macvlan network interface based on an\n"
211                "                            existing network interface to the container\n"
212                "     --network-veth         Add a virtual ethernet connection between host\n"
213                "                            and container\n"
214                "     --network-bridge=INTERFACE\n"
215                "                            Add a virtual ethernet connection between host\n"
216                "                            and container and add it to an existing bridge on\n"
217                "                            the host\n"
218                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
219                "                            Expose a container IP port ont the host\n"
220                "  -Z --selinux-context=SECLABEL\n"
221                "                            Set the SELinux security context to be used by\n"
222                "                            processes in the container\n"
223                "  -L --selinux-apifs-context=SECLABEL\n"
224                "                            Set the SELinux security context to be used by\n"
225                "                            API/tmpfs file systems in the container\n"
226                "     --capability=CAP       In addition to the default, retain specified\n"
227                "                            capability\n"
228                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
229                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
230                "                            try-guest, try-host\n"
231                "  -j                        Equivalent to --link-journal=try-guest\n"
232                "     --read-only            Mount the root directory read-only\n"
233                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
234                "                            the container\n"
235                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
236                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
237                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
238                "     --share-system         Share system namespaces with host\n"
239                "     --register=BOOLEAN     Register container as machine\n"
240                "     --keep-unit            Do not register a scope for the machine, reuse\n"
241                "                            the service unit nspawn is running in\n"
242                "     --volatile[=MODE]      Run the system in volatile mode\n"
243                , program_invocation_short_name);
244 }
245
/*
 * Replaces *b with a cleaned-up absolute version of path.
 *
 * The path is first resolved with canonicalize_file_name(). If that fails
 * because the path does not exist (ENOENT), it is instead made absolute
 * relative to the current working directory. Any other resolution error is
 * returned as a negative errno value, and -ENOMEM is returned if the
 * fallback fails. On success the previous *b is freed, the new string
 * (passed through path_kill_slashes()) is stored, and 0 is returned.
 */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Only a missing path is tolerated; everything else is a hard error */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
266
267 static int parse_argv(int argc, char *argv[]) {
268
269         enum {
270                 ARG_VERSION = 0x100,
271                 ARG_PRIVATE_NETWORK,
272                 ARG_UUID,
273                 ARG_READ_ONLY,
274                 ARG_CAPABILITY,
275                 ARG_DROP_CAPABILITY,
276                 ARG_LINK_JOURNAL,
277                 ARG_BIND,
278                 ARG_BIND_RO,
279                 ARG_TMPFS,
280                 ARG_SETENV,
281                 ARG_SHARE_SYSTEM,
282                 ARG_REGISTER,
283                 ARG_KEEP_UNIT,
284                 ARG_NETWORK_INTERFACE,
285                 ARG_NETWORK_MACVLAN,
286                 ARG_NETWORK_VETH,
287                 ARG_NETWORK_BRIDGE,
288                 ARG_PERSONALITY,
289                 ARG_VOLATILE,
290                 ARG_TEMPLATE,
291         };
292
293         static const struct option options[] = {
294                 { "help",                  no_argument,       NULL, 'h'                   },
295                 { "version",               no_argument,       NULL, ARG_VERSION           },
296                 { "directory",             required_argument, NULL, 'D'                   },
297                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
298                 { "ephemeral",             no_argument,       NULL, 'x'                   },
299                 { "user",                  required_argument, NULL, 'u'                   },
300                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
301                 { "boot",                  no_argument,       NULL, 'b'                   },
302                 { "uuid",                  required_argument, NULL, ARG_UUID              },
303                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
304                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
305                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
306                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
307                 { "bind",                  required_argument, NULL, ARG_BIND              },
308                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
309                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
310                 { "machine",               required_argument, NULL, 'M'                   },
311                 { "slice",                 required_argument, NULL, 'S'                   },
312                 { "setenv",                required_argument, NULL, ARG_SETENV            },
313                 { "selinux-context",       required_argument, NULL, 'Z'                   },
314                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
315                 { "quiet",                 no_argument,       NULL, 'q'                   },
316                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
317                 { "register",              required_argument, NULL, ARG_REGISTER          },
318                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
319                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
320                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
321                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
322                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
323                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
324                 { "image",                 required_argument, NULL, 'i'                   },
325                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
326                 { "port",                  required_argument, NULL, 'p'                   },
327                 {}
328         };
329
330         int c, r;
331         uint64_t plus = 0, minus = 0;
332
333         assert(argc >= 0);
334         assert(argv);
335
336         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:", options, NULL)) >= 0)
337
338                 switch (c) {
339
340                 case 'h':
341                         help();
342                         return 0;
343
344                 case ARG_VERSION:
345                         puts(PACKAGE_STRING);
346                         puts(SYSTEMD_FEATURES);
347                         return 0;
348
349                 case 'D':
350                         r = set_sanitized_path(&arg_directory, optarg);
351                         if (r < 0)
352                                 return log_error_errno(r, "Invalid root directory: %m");
353
354                         break;
355
356                 case ARG_TEMPLATE:
357                         r = set_sanitized_path(&arg_template, optarg);
358                         if (r < 0)
359                                 return log_error_errno(r, "Invalid template directory: %m");
360
361                         break;
362
363                 case 'i':
364                         r = set_sanitized_path(&arg_image, optarg);
365                         if (r < 0)
366                                 return log_error_errno(r, "Invalid image path: %m");
367
368                         break;
369
370                 case 'x':
371                         arg_ephemeral = true;
372                         break;
373
374                 case 'u':
375                         free(arg_user);
376                         arg_user = strdup(optarg);
377                         if (!arg_user)
378                                 return log_oom();
379
380                         break;
381
382                 case ARG_NETWORK_BRIDGE:
383                         arg_network_bridge = optarg;
384
385                         /* fall through */
386
387                 case ARG_NETWORK_VETH:
388                         arg_network_veth = true;
389                         arg_private_network = true;
390                         break;
391
392                 case ARG_NETWORK_INTERFACE:
393                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
394                                 return log_oom();
395
396                         arg_private_network = true;
397                         break;
398
399                 case ARG_NETWORK_MACVLAN:
400                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
401                                 return log_oom();
402
403                         /* fall through */
404
405                 case ARG_PRIVATE_NETWORK:
406                         arg_private_network = true;
407                         break;
408
409                 case 'b':
410                         arg_boot = true;
411                         break;
412
413                 case ARG_UUID:
414                         r = sd_id128_from_string(optarg, &arg_uuid);
415                         if (r < 0) {
416                                 log_error("Invalid UUID: %s", optarg);
417                                 return r;
418                         }
419                         break;
420
421                 case 'S':
422                         arg_slice = optarg;
423                         break;
424
425                 case 'M':
426                         if (isempty(optarg)) {
427                                 free(arg_machine);
428                                 arg_machine = NULL;
429                         } else {
430                                 if (!machine_name_is_valid(optarg)) {
431                                         log_error("Invalid machine name: %s", optarg);
432                                         return -EINVAL;
433                                 }
434
435                                 r = free_and_strdup(&arg_machine, optarg);
436                                 if (r < 0)
437                                         return log_oom();
438
439                                 break;
440                         }
441
442                 case 'Z':
443                         arg_selinux_context = optarg;
444                         break;
445
446                 case 'L':
447                         arg_selinux_apifs_context = optarg;
448                         break;
449
450                 case ARG_READ_ONLY:
451                         arg_read_only = true;
452                         break;
453
454                 case ARG_CAPABILITY:
455                 case ARG_DROP_CAPABILITY: {
456                         const char *state, *word;
457                         size_t length;
458
459                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
460                                 _cleanup_free_ char *t;
461
462                                 t = strndup(word, length);
463                                 if (!t)
464                                         return log_oom();
465
466                                 if (streq(t, "all")) {
467                                         if (c == ARG_CAPABILITY)
468                                                 plus = (uint64_t) -1;
469                                         else
470                                                 minus = (uint64_t) -1;
471                                 } else {
472                                         int cap;
473
474                                         cap = capability_from_name(t);
475                                         if (cap < 0) {
476                                                 log_error("Failed to parse capability %s.", t);
477                                                 return -EINVAL;
478                                         }
479
480                                         if (c == ARG_CAPABILITY)
481                                                 plus |= 1ULL << (uint64_t) cap;
482                                         else
483                                                 minus |= 1ULL << (uint64_t) cap;
484                                 }
485                         }
486
487                         break;
488                 }
489
490                 case 'j':
491                         arg_link_journal = LINK_GUEST;
492                         arg_link_journal_try = true;
493                         break;
494
495                 case ARG_LINK_JOURNAL:
496                         if (streq(optarg, "auto")) {
497                                 arg_link_journal = LINK_AUTO;
498                                 arg_link_journal_try = false;
499                         } else if (streq(optarg, "no")) {
500                                 arg_link_journal = LINK_NO;
501                                 arg_link_journal_try = false;
502                         } else if (streq(optarg, "guest")) {
503                                 arg_link_journal = LINK_GUEST;
504                                 arg_link_journal_try = false;
505                         } else if (streq(optarg, "host")) {
506                                 arg_link_journal = LINK_HOST;
507                                 arg_link_journal_try = false;
508                         } else if (streq(optarg, "try-guest")) {
509                                 arg_link_journal = LINK_GUEST;
510                                 arg_link_journal_try = true;
511                         } else if (streq(optarg, "try-host")) {
512                                 arg_link_journal = LINK_HOST;
513                                 arg_link_journal_try = true;
514                         } else {
515                                 log_error("Failed to parse link journal mode %s", optarg);
516                                 return -EINVAL;
517                         }
518
519                         break;
520
521                 case ARG_BIND:
522                 case ARG_BIND_RO: {
523                         _cleanup_free_ char *a = NULL, *b = NULL;
524                         char *e;
525                         char ***x;
526
527                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
528
529                         e = strchr(optarg, ':');
530                         if (e) {
531                                 a = strndup(optarg, e - optarg);
532                                 b = strdup(e + 1);
533                         } else {
534                                 a = strdup(optarg);
535                                 b = strdup(optarg);
536                         }
537
538                         if (!a || !b)
539                                 return log_oom();
540
541                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
542                                 log_error("Invalid bind mount specification: %s", optarg);
543                                 return -EINVAL;
544                         }
545
546                         r = strv_extend(x, a);
547                         if (r < 0)
548                                 return log_oom();
549
550                         r = strv_extend(x, b);
551                         if (r < 0)
552                                 return log_oom();
553
554                         break;
555                 }
556
557                 case ARG_TMPFS: {
558                         _cleanup_free_ char *a = NULL, *b = NULL;
559                         char *e;
560
561                         e = strchr(optarg, ':');
562                         if (e) {
563                                 a = strndup(optarg, e - optarg);
564                                 b = strdup(e + 1);
565                         } else {
566                                 a = strdup(optarg);
567                                 b = strdup("mode=0755");
568                         }
569
570                         if (!a || !b)
571                                 return log_oom();
572
573                         if (!path_is_absolute(a)) {
574                                 log_error("Invalid tmpfs specification: %s", optarg);
575                                 return -EINVAL;
576                         }
577
578                         r = strv_push(&arg_tmpfs, a);
579                         if (r < 0)
580                                 return log_oom();
581
582                         a = NULL;
583
584                         r = strv_push(&arg_tmpfs, b);
585                         if (r < 0)
586                                 return log_oom();
587
588                         b = NULL;
589
590                         break;
591                 }
592
593                 case ARG_SETENV: {
594                         char **n;
595
596                         if (!env_assignment_is_valid(optarg)) {
597                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
598                                 return -EINVAL;
599                         }
600
601                         n = strv_env_set(arg_setenv, optarg);
602                         if (!n)
603                                 return log_oom();
604
605                         strv_free(arg_setenv);
606                         arg_setenv = n;
607                         break;
608                 }
609
610                 case 'q':
611                         arg_quiet = true;
612                         break;
613
614                 case ARG_SHARE_SYSTEM:
615                         arg_share_system = true;
616                         break;
617
618                 case ARG_REGISTER:
619                         r = parse_boolean(optarg);
620                         if (r < 0) {
621                                 log_error("Failed to parse --register= argument: %s", optarg);
622                                 return r;
623                         }
624
625                         arg_register = r;
626                         break;
627
628                 case ARG_KEEP_UNIT:
629                         arg_keep_unit = true;
630                         break;
631
632                 case ARG_PERSONALITY:
633
634                         arg_personality = personality_from_string(optarg);
635                         if (arg_personality == 0xffffffffLU) {
636                                 log_error("Unknown or unsupported personality '%s'.", optarg);
637                                 return -EINVAL;
638                         }
639
640                         break;
641
642                 case ARG_VOLATILE:
643
644                         if (!optarg)
645                                 arg_volatile = VOLATILE_YES;
646                         else {
647                                 r = parse_boolean(optarg);
648                                 if (r < 0) {
649                                         if (streq(optarg, "state"))
650                                                 arg_volatile = VOLATILE_STATE;
651                                         else {
652                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
653                                                 return r;
654                                         }
655                                 } else
656                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
657                         }
658
659                         break;
660
661                 case 'p': {
662                         const char *split, *e;
663                         uint16_t container_port, host_port;
664                         int protocol;
665                         ExposePort *p;
666
667                         if ((e = startswith(optarg, "tcp:")))
668                                 protocol = IPPROTO_TCP;
669                         else if ((e = startswith(optarg, "udp:")))
670                                 protocol = IPPROTO_UDP;
671                         else {
672                                 e = optarg;
673                                 protocol = IPPROTO_TCP;
674                         }
675
676                         split = strchr(e, ':');
677                         if (split) {
678                                 char v[split - e + 1];
679
680                                 memcpy(v, e, split - e);
681                                 v[split - e] = 0;
682
683                                 r = safe_atou16(v, &host_port);
684                                 if (r < 0 || host_port <= 0) {
685                                         log_error("Failed to parse host port: %s", optarg);
686                                         return -EINVAL;
687                                 }
688
689                                 r = safe_atou16(split + 1, &container_port);
690                         } else {
691                                 r = safe_atou16(e, &container_port);
692                                 host_port = container_port;
693                         }
694
695                         if (r < 0 || container_port <= 0) {
696                                 log_error("Failed to parse host port: %s", optarg);
697                                 return -EINVAL;
698                         }
699
700                         LIST_FOREACH(ports, p, arg_expose_ports) {
701                                 if (p->protocol == protocol && p->host_port == host_port) {
702                                         log_error("Duplicate port specification: %s", optarg);
703                                         return -EINVAL;
704                                 }
705                         }
706
707                         p = new(ExposePort, 1);
708                         if (!p)
709                                 return log_oom();
710
711                         p->protocol = protocol;
712                         p->host_port = host_port;
713                         p->container_port = container_port;
714
715                         LIST_PREPEND(ports, arg_expose_ports, p);
716
717                         break;
718                 }
719
720                 case '?':
721                         return -EINVAL;
722
723                 default:
724                         assert_not_reached("Unhandled option");
725                 }
726
727         if (arg_share_system)
728                 arg_register = false;
729
730         if (arg_boot && arg_share_system) {
731                 log_error("--boot and --share-system may not be combined.");
732                 return -EINVAL;
733         }
734
735         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
736                 log_error("--keep-unit may not be used when invoked from a user session.");
737                 return -EINVAL;
738         }
739
740         if (arg_directory && arg_image) {
741                 log_error("--directory= and --image= may not be combined.");
742                 return -EINVAL;
743         }
744
745         if (arg_template && arg_image) {
746                 log_error("--template= and --image= may not be combined.");
747                 return -EINVAL;
748         }
749
750         if (arg_template && !(arg_directory || arg_machine)) {
751                 log_error("--template= needs --directory= or --machine=.");
752                 return -EINVAL;
753         }
754
755         if (arg_ephemeral && arg_template) {
756                 log_error("--ephemeral and --template= may not be combined.");
757                 return -EINVAL;
758         }
759
760         if (arg_ephemeral && arg_image) {
761                 log_error("--ephemeral and --image= may not be combined.");
762                 return -EINVAL;
763         }
764
765         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
766                 log_error("--ephemeral and --link-journal= may not be combined.");
767                 return -EINVAL;
768         }
769
770         if (arg_volatile != VOLATILE_NO && arg_read_only) {
771                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
772                 return -EINVAL;
773         }
774
775         if (arg_expose_ports && !arg_private_network) {
776                 log_error("Cannot use --port= without private networking.");
777                 return -EINVAL;
778         }
779
780         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
781
782         return 1;
783 }
784
785 static int mount_all(const char *dest) {
786
787         typedef struct MountPoint {
788                 const char *what;
789                 const char *where;
790                 const char *type;
791                 const char *options;
792                 unsigned long flags;
793                 bool fatal;
794         } MountPoint;
795
796         static const MountPoint mount_table[] = {
797                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
798                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
799                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
800                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
801                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
802                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
803                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
804                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
805 #ifdef HAVE_SELINUX
806                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
807                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
808 #endif
809         };
810
811         unsigned k;
812         int r = 0;
813
814         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
815                 _cleanup_free_ char *where = NULL;
816 #ifdef HAVE_SELINUX
817                 _cleanup_free_ char *options = NULL;
818 #endif
819                 const char *o;
820                 int t;
821
822                 where = strjoin(dest, "/", mount_table[k].where, NULL);
823                 if (!where)
824                         return log_oom();
825
826                 t = path_is_mount_point(where, true);
827                 if (t < 0) {
828                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
829
830                         if (r == 0)
831                                 r = t;
832
833                         continue;
834                 }
835
836                 /* Skip this entry if it is not a remount. */
837                 if (mount_table[k].what && t > 0)
838                         continue;
839
840                 t = mkdir_p(where, 0755);
841                 if (t < 0) {
842                         if (mount_table[k].fatal) {
843                                log_error_errno(t, "Failed to create directory %s: %m", where);
844
845                                 if (r == 0)
846                                         r = t;
847                         } else
848                                log_warning_errno(t, "Failed to create directory %s: %m", where);
849
850                         continue;
851                 }
852
853 #ifdef HAVE_SELINUX
854                 if (arg_selinux_apifs_context &&
855                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
856                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
857                         if (!options)
858                                 return log_oom();
859
860                         o = options;
861                 } else
862 #endif
863                         o = mount_table[k].options;
864
865
866                 if (mount(mount_table[k].what,
867                           where,
868                           mount_table[k].type,
869                           mount_table[k].flags,
870                           o) < 0) {
871
872                         if (mount_table[k].fatal) {
873                                 log_error_errno(errno, "mount(%s) failed: %m", where);
874
875                                 if (r == 0)
876                                         r = -errno;
877                         } else
878                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
879                 }
880         }
881
882         return r;
883 }
884
885 static int mount_binds(const char *dest, char **l, bool ro) {
886         char **x, **y;
887
888         STRV_FOREACH_PAIR(x, y, l) {
889                 _cleanup_free_ char *where = NULL;
890                 struct stat source_st, dest_st;
891                 int r;
892
893                 if (stat(*x, &source_st) < 0)
894                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
895
896                 where = strappend(dest, *y);
897                 if (!where)
898                         return log_oom();
899
900                 r = stat(where, &dest_st);
901                 if (r == 0) {
902                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
903                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
904                                 return -EINVAL;
905                         }
906                 } else if (errno == ENOENT) {
907                         r = mkdir_parents_label(where, 0755);
908                         if (r < 0)
909                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
910                 } else {
911                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
912                         return -errno;
913                 }
914
915                 /* Create the mount point, but be conservative -- refuse to create block
916                  * and char devices. */
917                 if (S_ISDIR(source_st.st_mode)) {
918                         r = mkdir_label(where, 0755);
919                         if (r < 0 && errno != EEXIST)
920                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
921                 } else if (S_ISFIFO(source_st.st_mode)) {
922                         r = mkfifo(where, 0644);
923                         if (r < 0 && errno != EEXIST)
924                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
925                 } else if (S_ISSOCK(source_st.st_mode)) {
926                         r = mknod(where, 0644 | S_IFSOCK, 0);
927                         if (r < 0 && errno != EEXIST)
928                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
929                 } else if (S_ISREG(source_st.st_mode)) {
930                         r = touch(where);
931                         if (r < 0)
932                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
933                 } else {
934                         log_error("Refusing to create mountpoint for file: %s", *x);
935                         return -ENOTSUP;
936                 }
937
938                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
939                         return log_error_errno(errno, "mount(%s) failed: %m", where);
940
941                 if (ro) {
942                         r = bind_remount_recursive(where, true);
943                         if (r < 0)
944                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
945                 }
946         }
947
948         return 0;
949 }
950
/* Mounts one cgroup hierarchy at <dest>/sys/fs/cgroup/<hierarchy>.
 *
 * controller is handed to mount() as the option string, read_only adds
 * MS_RDONLY to the mount flags.
 *
 * Returns 1 if the hierarchy was mounted now, 0 if the target was a mount
 * point already, and a negative errno-style value on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        unsigned long flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *target;
        int q;

        target = strappenda(dest, "/sys/fs/cgroup/", hierarchy);

        q = path_is_mount_point(target, false);
        if (q < 0)
                return log_error_errno(q, "Failed to determine if %s is mounted already: %m", target);
        if (q > 0)
                return 0;

        /* The result is not checked here; a missing directory surfaces as a
         * mount() failure right below. */
        mkdir_p(target, 0755);

        if (read_only)
                flags |= MS_RDONLY;

        if (mount("cgroup", target, "cgroup", flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", target);

        return 1;
}
970
971 static int mount_cgroup(const char *dest) {
972         _cleanup_set_free_free_ Set *controllers = NULL;
973         _cleanup_free_ char *own_cgroup_path = NULL;
974         const char *cgroup_root, *systemd_root, *systemd_own;
975         int r;
976
977         controllers = set_new(&string_hash_ops);
978         if (!controllers)
979                 return log_oom();
980
981         r = cg_kernel_controllers(controllers);
982         if (r < 0)
983                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
984
985         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
986         if (r < 0)
987                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
988
989         cgroup_root = strappenda(dest, "/sys/fs/cgroup");
990         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
991                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
992
993         for (;;) {
994                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
995
996                 controller = set_steal_first(controllers);
997                 if (!controller)
998                         break;
999
1000                 origin = strappend("/sys/fs/cgroup/", controller);
1001                 if (!origin)
1002                         return log_oom();
1003
1004                 r = readlink_malloc(origin, &combined);
1005                 if (r == -EINVAL) {
1006                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1007
1008                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1009                         if (r < 0)
1010                                 return r;
1011
1012                 } else if (r < 0)
1013                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1014                 else {
1015                         _cleanup_free_ char *target = NULL;
1016
1017                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1018                         if (!target)
1019                                 return log_oom();
1020
1021                         /* A symbolic link, a combination of controllers in one hierarchy */
1022
1023                         if (!filename_is_valid(combined)) {
1024                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1025                                 continue;
1026                         }
1027
1028                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1029                         if (r < 0)
1030                                 return r;
1031
1032                         if (symlink(combined, target) < 0)
1033                                 return log_error_errno(errno, "Failed to create symlink for combined hiearchy: %m");
1034                 }
1035         }
1036
1037         r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1038         if (r < 0)
1039                 return r;
1040
1041         /* Make our own cgroup a (writable) bind mount */
1042         systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1043         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1044                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1045
1046         /* And then remount the systemd cgroup root read-only */
1047         systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1048         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1049                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1050
1051         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1052                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1053
1054         return 0;
1055 }
1056
1057 static int mount_tmpfs(const char *dest) {
1058         char **i, **o;
1059
1060         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1061                 _cleanup_free_ char *where = NULL;
1062                 int r;
1063
1064                 where = strappend(dest, *i);
1065                 if (!where)
1066                         return log_oom();
1067
1068                 r = mkdir_label(where, 0755);
1069                 if (r < 0 && r != -EEXIST)
1070                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1071
1072                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1073                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1074         }
1075
1076         return 0;
1077 }
1078
1079 static int setup_timezone(const char *dest) {
1080         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1081         char *z, *y;
1082         int r;
1083
1084         assert(dest);
1085
1086         /* Fix the timezone, if possible */
1087         r = readlink_malloc("/etc/localtime", &p);
1088         if (r < 0) {
1089                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1090                 return 0;
1091         }
1092
1093         z = path_startswith(p, "../usr/share/zoneinfo/");
1094         if (!z)
1095                 z = path_startswith(p, "/usr/share/zoneinfo/");
1096         if (!z) {
1097                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1098                 return 0;
1099         }
1100
1101         where = strappend(dest, "/etc/localtime");
1102         if (!where)
1103                 return log_oom();
1104
1105         r = readlink_malloc(where, &q);
1106         if (r >= 0) {
1107                 y = path_startswith(q, "../usr/share/zoneinfo/");
1108                 if (!y)
1109                         y = path_startswith(q, "/usr/share/zoneinfo/");
1110
1111                 /* Already pointing to the right place? Then do nothing .. */
1112                 if (y && streq(y, z))
1113                         return 0;
1114         }
1115
1116         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1117         if (!check)
1118                 return log_oom();
1119
1120         if (access(check, F_OK) < 0) {
1121                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1122                 return 0;
1123         }
1124
1125         what = strappend("../usr/share/zoneinfo/", z);
1126         if (!what)
1127                 return log_oom();
1128
1129         r = mkdir_parents(where, 0755);
1130         if (r < 0) {
1131                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1132
1133                 return 0;
1134         }
1135
1136         r = unlink(where);
1137         if (r < 0 && errno != ENOENT) {
1138                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1139
1140                 return 0;
1141         }
1142
1143         if (symlink(what, where) < 0) {
1144                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1145                 return 0;
1146         }
1147
1148         return 0;
1149 }
1150
1151 static int setup_resolv_conf(const char *dest) {
1152         _cleanup_free_ char *where = NULL;
1153         int r;
1154
1155         assert(dest);
1156
1157         if (arg_private_network)
1158                 return 0;
1159
1160         /* Fix resolv.conf, if possible */
1161         where = strappend(dest, "/etc/resolv.conf");
1162         if (!where)
1163                 return log_oom();
1164
1165         /* We don't really care for the results of this really. If it
1166          * fails, it fails, but meh... */
1167         r = mkdir_parents(where, 0755);
1168         if (r < 0) {
1169                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1170
1171                 return 0;
1172         }
1173
1174         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1175         if (r < 0) {
1176                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1177
1178                 return 0;
1179         }
1180
1181         return 0;
1182 }
1183
1184 static int setup_volatile_state(const char *directory) {
1185         const char *p;
1186         int r;
1187
1188         assert(directory);
1189
1190         if (arg_volatile != VOLATILE_STATE)
1191                 return 0;
1192
1193         /* --volatile=state means we simply overmount /var
1194            with a tmpfs, and the rest read-only. */
1195
1196         r = bind_remount_recursive(directory, true);
1197         if (r < 0)
1198                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1199
1200         p = strappenda(directory, "/var");
1201         r = mkdir(p, 0755);
1202         if (r < 0 && errno != EEXIST)
1203                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1204
1205         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1206                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1207
1208         return 0;
1209 }
1210
1211 static int setup_volatile(const char *directory) {
1212         bool tmpfs_mounted = false, bind_mounted = false;
1213         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1214         const char *f, *t;
1215         int r;
1216
1217         assert(directory);
1218
1219         if (arg_volatile != VOLATILE_YES)
1220                 return 0;
1221
1222         /* --volatile=yes means we mount a tmpfs to the root dir, and
1223            the original /usr to use inside it, and that read-only. */
1224
1225         if (!mkdtemp(template))
1226                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1227
1228         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1229                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1230                 r = -errno;
1231                 goto fail;
1232         }
1233
1234         tmpfs_mounted = true;
1235
1236         f = strappenda(directory, "/usr");
1237         t = strappenda(template, "/usr");
1238
1239         r = mkdir(t, 0755);
1240         if (r < 0 && errno != EEXIST) {
1241                 log_error_errno(errno, "Failed to create %s: %m", t);
1242                 r = -errno;
1243                 goto fail;
1244         }
1245
1246         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1247                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1248                 r = -errno;
1249                 goto fail;
1250         }
1251
1252         bind_mounted = true;
1253
1254         r = bind_remount_recursive(t, true);
1255         if (r < 0) {
1256                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1257                 goto fail;
1258         }
1259
1260         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1261                 log_error_errno(errno, "Failed to move root mount: %m");
1262                 r = -errno;
1263                 goto fail;
1264         }
1265
1266         rmdir(template);
1267
1268         return 0;
1269
1270 fail:
1271         if (bind_mounted)
1272                 umount(t);
1273         if (tmpfs_mounted)
1274                 umount(template);
1275         rmdir(template);
1276         return r;
1277 }
1278
1279 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1280
1281         snprintf(s, 37,
1282                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1283                  SD_ID128_FORMAT_VAL(id));
1284
1285         return s;
1286 }
1287
1288 static int setup_boot_id(const char *dest) {
1289         _cleanup_free_ char *from = NULL, *to = NULL;
1290         sd_id128_t rnd = {};
1291         char as_uuid[37];
1292         int r;
1293
1294         assert(dest);
1295
1296         if (arg_share_system)
1297                 return 0;
1298
1299         /* Generate a new randomized boot ID, so that each boot-up of
1300          * the container gets a new one */
1301
1302         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1303         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1304         if (!from || !to)
1305                 return log_oom();
1306
1307         r = sd_id128_randomize(&rnd);
1308         if (r < 0)
1309                 return log_error_errno(r, "Failed to generate random boot id: %m");
1310
1311         id128_format_as_uuid(rnd, as_uuid);
1312
1313         r = write_string_file(from, as_uuid);
1314         if (r < 0)
1315                 return log_error_errno(r, "Failed to write boot id: %m");
1316
1317         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1318                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1319                 r = -errno;
1320         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1321                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1322
1323         unlink(from);
1324         return r;
1325 }
1326
1327 static int copy_devnodes(const char *dest) {
1328
1329         static const char devnodes[] =
1330                 "null\0"
1331                 "zero\0"
1332                 "full\0"
1333                 "random\0"
1334                 "urandom\0"
1335                 "tty\0"
1336                 "net/tun\0";
1337
1338         const char *d;
1339         int r = 0;
1340         _cleanup_umask_ mode_t u;
1341
1342         assert(dest);
1343
1344         u = umask(0000);
1345
1346         NULSTR_FOREACH(d, devnodes) {
1347                 _cleanup_free_ char *from = NULL, *to = NULL;
1348                 struct stat st;
1349
1350                 from = strappend("/dev/", d);
1351                 to = strjoin(dest, "/dev/", d, NULL);
1352                 if (!from || !to)
1353                         return log_oom();
1354
1355                 if (stat(from, &st) < 0) {
1356
1357                         if (errno != ENOENT)
1358                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1359
1360                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1361
1362                         log_error("%s is not a char or block device, cannot copy", from);
1363                         return -EIO;
1364
1365                 } else {
1366                         r = mkdir_parents(to, 0775);
1367                         if (r < 0) {
1368                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1369                                 return -r;
1370                         }
1371
1372                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1373                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1374                 }
1375         }
1376
1377         return r;
1378 }
1379
1380 static int setup_ptmx(const char *dest) {
1381         _cleanup_free_ char *p = NULL;
1382
1383         p = strappend(dest, "/dev/ptmx");
1384         if (!p)
1385                 return log_oom();
1386
1387         if (symlink("pts/ptmx", p) < 0)
1388                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1389
1390         return 0;
1391 }
1392
1393 static int setup_dev_console(const char *dest, const char *console) {
1394         _cleanup_umask_ mode_t u;
1395         const char *to;
1396         struct stat st;
1397         int r;
1398
1399         assert(dest);
1400         assert(console);
1401
1402         u = umask(0000);
1403
1404         if (stat("/dev/null", &st) < 0)
1405                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1406
1407         r = chmod_and_chown(console, 0600, 0, 0);
1408         if (r < 0)
1409                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1410
1411         /* We need to bind mount the right tty to /dev/console since
1412          * ptys can only exist on pts file systems. To have something
1413          * to bind mount things on we create a device node first, and
1414          * use /dev/null for that since we the cgroups device policy
1415          * allows us to create that freely, while we cannot create
1416          * /dev/console. (Note that the major minor doesn't actually
1417          * matter here, since we mount it over anyway). */
1418
1419         to = strappenda(dest, "/dev/console");
1420         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1421                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1422
1423         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1424                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1425
1426         return 0;
1427 }
1428
1429 static int setup_kmsg(const char *dest, int kmsg_socket) {
1430         _cleanup_free_ char *from = NULL, *to = NULL;
1431         _cleanup_umask_ mode_t u;
1432         int r, fd, k;
1433         union {
1434                 struct cmsghdr cmsghdr;
1435                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1436         } control = {};
1437         struct msghdr mh = {
1438                 .msg_control = &control,
1439                 .msg_controllen = sizeof(control),
1440         };
1441         struct cmsghdr *cmsg;
1442
1443         assert(dest);
1444         assert(kmsg_socket >= 0);
1445
1446         u = umask(0000);
1447
1448         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1449          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1450          * on the reading side behave very similar to /proc/kmsg,
1451          * their writing side behaves differently from /dev/kmsg in
1452          * that writing blocks when nothing is reading. In order to
1453          * avoid any problems with containers deadlocking due to this
1454          * we simply make /dev/kmsg unavailable to the container. */
1455         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1456             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1457                 return log_oom();
1458
1459         if (mkfifo(from, 0600) < 0)
1460                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1461
1462         r = chmod_and_chown(from, 0600, 0, 0);
1463         if (r < 0)
1464                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1465
1466         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1467                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1468
1469         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1470         if (fd < 0)
1471                 return log_error_errno(errno, "Failed to open fifo: %m");
1472
1473         cmsg = CMSG_FIRSTHDR(&mh);
1474         cmsg->cmsg_level = SOL_SOCKET;
1475         cmsg->cmsg_type = SCM_RIGHTS;
1476         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1477         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1478
1479         mh.msg_controllen = cmsg->cmsg_len;
1480
1481         /* Store away the fd in the socket, so that it stays open as
1482          * long as we run the child */
1483         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1484         safe_close(fd);
1485
1486         if (k < 0)
1487                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1488
1489         /* And now make the FIFO unavailable as /dev/kmsg... */
1490         unlink(from);
1491         return 0;
1492 }
1493
1494 static int send_rtnl(int send_fd) {
1495         union {
1496                 struct cmsghdr cmsghdr;
1497                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1498         } control = {};
1499         struct msghdr mh = {
1500                 .msg_control = &control,
1501                 .msg_controllen = sizeof(control),
1502         };
1503         struct cmsghdr *cmsg;
1504         _cleanup_close_ int fd = -1;
1505         ssize_t k;
1506
1507         assert(send_fd >= 0);
1508
1509         if (!arg_expose_ports)
1510                 return 0;
1511
1512         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1513         if (fd < 0)
1514                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1515
1516         cmsg = CMSG_FIRSTHDR(&mh);
1517         cmsg->cmsg_level = SOL_SOCKET;
1518         cmsg->cmsg_type = SCM_RIGHTS;
1519         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1520         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1521
1522         mh.msg_controllen = cmsg->cmsg_len;
1523
1524         /* Store away the fd in the socket, so that it stays open as
1525          * long as we run the child */
1526         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1527         if (k < 0)
1528                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1529
1530         return 0;
1531 }
1532
1533 static int flush_ports(union in_addr_union *exposed) {
1534         ExposePort *p;
1535         int r, af = AF_INET;
1536
1537         assert(exposed);
1538
1539         if (!arg_expose_ports)
1540                 return 0;
1541
1542         if (in_addr_is_null(af, exposed))
1543                 return 0;
1544
1545         log_debug("Lost IP address.");
1546
1547         LIST_FOREACH(ports, p, arg_expose_ports) {
1548                 r = fw_add_local_dnat(false,
1549                                       af,
1550                                       p->protocol,
1551                                       NULL,
1552                                       NULL, 0,
1553                                       NULL, 0,
1554                                       p->host_port,
1555                                       exposed,
1556                                       p->container_port,
1557                                       NULL);
1558                 if (r < 0)
1559                         log_warning_errno(r, "Failed to modify firewall: %m");
1560         }
1561
1562         *exposed = IN_ADDR_NULL;
1563         return 0;
1564 }
1565
1566 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1567         _cleanup_free_ struct local_address *addresses = NULL;
1568         _cleanup_free_ char *pretty = NULL;
1569         union in_addr_union new_exposed;
1570         ExposePort *p;
1571         bool add;
1572         int af = AF_INET, r;
1573
1574         assert(exposed);
1575
1576         /* Invoked each time an address is added or removed inside the
1577          * container */
1578
1579         if (!arg_expose_ports)
1580                 return 0;
1581
1582         r = local_addresses(rtnl, 0, af, &addresses);
1583         if (r < 0)
1584                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1585
1586         add = r > 0 &&
1587                 addresses[0].family == af &&
1588                 addresses[0].scope < RT_SCOPE_LINK;
1589
1590         if (!add)
1591                 return flush_ports(exposed);
1592
1593         new_exposed = addresses[0].address;
1594         if (in_addr_equal(af, exposed, &new_exposed))
1595                 return 0;
1596
1597         in_addr_to_string(af, &new_exposed, &pretty);
1598         log_debug("New container IP is %s.", strna(pretty));
1599
1600         LIST_FOREACH(ports, p, arg_expose_ports) {
1601
1602                 r = fw_add_local_dnat(true,
1603                                       af,
1604                                       p->protocol,
1605                                       NULL,
1606                                       NULL, 0,
1607                                       NULL, 0,
1608                                       p->host_port,
1609                                       &new_exposed,
1610                                       p->container_port,
1611                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1612                 if (r < 0)
1613                         log_warning_errno(r, "Failed to modify firewall: %m");
1614         }
1615
1616         *exposed = new_exposed;
1617         return 0;
1618 }
1619
1620 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1621         union in_addr_union *exposed = userdata;
1622
1623         assert(rtnl);
1624         assert(m);
1625         assert(exposed);
1626
1627         expose_ports(rtnl, exposed);
1628         return 0;
1629 }
1630
1631 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1632         union {
1633                 struct cmsghdr cmsghdr;
1634                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1635         } control = {};
1636         struct msghdr mh = {
1637                 .msg_control = &control,
1638                 .msg_controllen = sizeof(control),
1639         };
1640         struct cmsghdr *cmsg;
1641         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1642         int fd, r;
1643         ssize_t k;
1644
1645         assert(event);
1646         assert(recv_fd >= 0);
1647         assert(ret);
1648
1649         if (!arg_expose_ports)
1650                 return 0;
1651
1652         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1653         if (k < 0)
1654                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1655
1656         cmsg = CMSG_FIRSTHDR(&mh);
1657         assert(cmsg->cmsg_level == SOL_SOCKET);
1658         assert(cmsg->cmsg_type == SCM_RIGHTS);
1659         assert(cmsg->cmsg_len = CMSG_LEN(sizeof(int)));
1660         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1661
1662         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1663         if (r < 0) {
1664                 safe_close(fd);
1665                 return log_error_errno(r, "Failed to create rtnl object: %m");
1666         }
1667
1668         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1669         if (r < 0)
1670                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1671
1672         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1673         if (r < 0)
1674                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1675
1676         r = sd_rtnl_attach_event(rtnl, event, 0);
1677         if (r < 0)
1678                 return log_error_errno(r, "Failed to add to even loop: %m");
1679
1680         *ret = rtnl;
1681         rtnl = NULL;
1682
1683         return 0;
1684 }
1685
1686 static int setup_hostname(void) {
1687
1688         if (arg_share_system)
1689                 return 0;
1690
1691         if (sethostname_idempotent(arg_machine) < 0)
1692                 return -errno;
1693
1694         return 0;
1695 }
1696
1697 static int setup_journal(const char *directory) {
1698         sd_id128_t machine_id, this_id;
1699         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1700         char *id;
1701         int r;
1702
1703         /* Don't link journals in ephemeral mode */
1704         if (arg_ephemeral)
1705                 return 0;
1706
1707         p = strappend(directory, "/etc/machine-id");
1708         if (!p)
1709                 return log_oom();
1710
1711         r = read_one_line_file(p, &b);
1712         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1713                 return 0;
1714         else if (r < 0)
1715                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1716
1717         id = strstrip(b);
1718         if (isempty(id) && arg_link_journal == LINK_AUTO)
1719                 return 0;
1720
1721         /* Verify validity */
1722         r = sd_id128_from_string(id, &machine_id);
1723         if (r < 0)
1724                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1725
1726         r = sd_id128_get_machine(&this_id);
1727         if (r < 0)
1728                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1729
1730         if (sd_id128_equal(machine_id, this_id)) {
1731                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1732                          "Host and machine ids are equal (%s): refusing to link journals", id);
1733                 if (arg_link_journal == LINK_AUTO)
1734                         return 0;
1735                 return -EEXIST;
1736         }
1737
1738         if (arg_link_journal == LINK_NO)
1739                 return 0;
1740
1741         free(p);
1742         p = strappend("/var/log/journal/", id);
1743         q = strjoin(directory, "/var/log/journal/", id, NULL);
1744         if (!p || !q)
1745                 return log_oom();
1746
1747         if (path_is_mount_point(p, false) > 0) {
1748                 if (arg_link_journal != LINK_AUTO) {
1749                         log_error("%s: already a mount point, refusing to use for journal", p);
1750                         return -EEXIST;
1751                 }
1752
1753                 return 0;
1754         }
1755
1756         if (path_is_mount_point(q, false) > 0) {
1757                 if (arg_link_journal != LINK_AUTO) {
1758                         log_error("%s: already a mount point, refusing to use for journal", q);
1759                         return -EEXIST;
1760                 }
1761
1762                 return 0;
1763         }
1764
1765         r = readlink_and_make_absolute(p, &d);
1766         if (r >= 0) {
1767                 if ((arg_link_journal == LINK_GUEST ||
1768                      arg_link_journal == LINK_AUTO) &&
1769                     path_equal(d, q)) {
1770
1771                         r = mkdir_p(q, 0755);
1772                         if (r < 0)
1773                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1774                         return 0;
1775                 }
1776
1777                 if (unlink(p) < 0)
1778                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1779         } else if (r == -EINVAL) {
1780
1781                 if (arg_link_journal == LINK_GUEST &&
1782                     rmdir(p) < 0) {
1783
1784                         if (errno == ENOTDIR) {
1785                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1786                                 return r;
1787                         } else {
1788                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1789                                 return -errno;
1790                         }
1791                 }
1792         } else if (r != -ENOENT) {
1793                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1794                 return r;
1795         }
1796
1797         if (arg_link_journal == LINK_GUEST) {
1798
1799                 if (symlink(q, p) < 0) {
1800                         if (arg_link_journal_try) {
1801                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1802                                 return 0;
1803                         } else {
1804                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1805                                 return -errno;
1806                         }
1807                 }
1808
1809                 r = mkdir_p(q, 0755);
1810                 if (r < 0)
1811                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1812                 return 0;
1813         }
1814
1815         if (arg_link_journal == LINK_HOST) {
1816                 /* don't create parents here -- if the host doesn't have
1817                  * permanent journal set up, don't force it here */
1818                 r = mkdir(p, 0755);
1819                 if (r < 0) {
1820                         if (arg_link_journal_try) {
1821                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1822                                 return 0;
1823                         } else {
1824                                 log_error_errno(errno, "Failed to create %s: %m", p);
1825                                 return r;
1826                         }
1827                 }
1828
1829         } else if (access(p, F_OK) < 0)
1830                 return 0;
1831
1832         if (dir_is_empty(q) == 0)
1833                 log_warning("%s is not empty, proceeding anyway.", q);
1834
1835         r = mkdir_p(q, 0755);
1836         if (r < 0) {
1837                 log_error_errno(errno, "Failed to create %s: %m", q);
1838                 return r;
1839         }
1840
1841         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1842                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1843
1844         return 0;
1845 }
1846
1847 static int drop_capabilities(void) {
1848         return capability_bounding_set_drop(~arg_retain, false);
1849 }
1850
1851 static int register_machine(pid_t pid, int local_ifindex) {
1852         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1853         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1854         int r;
1855
1856         if (!arg_register)
1857                 return 0;
1858
1859         r = sd_bus_default_system(&bus);
1860         if (r < 0)
1861                 return log_error_errno(r, "Failed to open system bus: %m");
1862
1863         if (arg_keep_unit) {
1864                 r = sd_bus_call_method(
1865                                 bus,
1866                                 "org.freedesktop.machine1",
1867                                 "/org/freedesktop/machine1",
1868                                 "org.freedesktop.machine1.Manager",
1869                                 "RegisterMachineWithNetwork",
1870                                 &error,
1871                                 NULL,
1872                                 "sayssusai",
1873                                 arg_machine,
1874                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1875                                 "nspawn",
1876                                 "container",
1877                                 (uint32_t) pid,
1878                                 strempty(arg_directory),
1879                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1880         } else {
1881                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1882
1883                 r = sd_bus_message_new_method_call(
1884                                 bus,
1885                                 &m,
1886                                 "org.freedesktop.machine1",
1887                                 "/org/freedesktop/machine1",
1888                                 "org.freedesktop.machine1.Manager",
1889                                 "CreateMachineWithNetwork");
1890                 if (r < 0)
1891                         return log_error_errno(r, "Failed to create message: %m");
1892
1893                 r = sd_bus_message_append(
1894                                 m,
1895                                 "sayssusai",
1896                                 arg_machine,
1897                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1898                                 "nspawn",
1899                                 "container",
1900                                 (uint32_t) pid,
1901                                 strempty(arg_directory),
1902                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1903                 if (r < 0)
1904                         return log_error_errno(r, "Failed to append message arguments: %m");
1905
1906                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1907                 if (r < 0)
1908                         return log_error_errno(r, "Failed to open container: %m");
1909
1910                 if (!isempty(arg_slice)) {
1911                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1912                         if (r < 0)
1913                                 return log_error_errno(r, "Failed to append slice: %m");
1914                 }
1915
1916                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1917                 if (r < 0)
1918                         return log_error_errno(r, "Failed to add device policy: %m");
1919
1920                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1921                                           /* Allow the container to
1922                                            * access and create the API
1923                                            * device nodes, so that
1924                                            * PrivateDevices= in the
1925                                            * container can work
1926                                            * fine */
1927                                           "/dev/null", "rwm",
1928                                           "/dev/zero", "rwm",
1929                                           "/dev/full", "rwm",
1930                                           "/dev/random", "rwm",
1931                                           "/dev/urandom", "rwm",
1932                                           "/dev/tty", "rwm",
1933                                           "/dev/net/tun", "rwm",
1934                                           /* Allow the container
1935                                            * access to ptys. However,
1936                                            * do not permit the
1937                                            * container to ever create
1938                                            * these device nodes. */
1939                                           "/dev/pts/ptmx", "rw",
1940                                           "char-pts", "rw");
1941                 if (r < 0)
1942                         return log_error_errno(r, "Failed to add device whitelist: %m");
1943
1944                 r = sd_bus_message_close_container(m);
1945                 if (r < 0)
1946                         return log_error_errno(r, "Failed to close container: %m");
1947
1948                 r = sd_bus_call(bus, m, 0, &error, NULL);
1949         }
1950
1951         if (r < 0) {
1952                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1953                 return r;
1954         }
1955
1956         return 0;
1957 }
1958
1959 static int terminate_machine(pid_t pid) {
1960         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1961         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1962         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1963         const char *path;
1964         int r;
1965
1966         if (!arg_register)
1967                 return 0;
1968
1969         r = sd_bus_default_system(&bus);
1970         if (r < 0)
1971                 return log_error_errno(r, "Failed to open system bus: %m");
1972
1973         r = sd_bus_call_method(
1974                         bus,
1975                         "org.freedesktop.machine1",
1976                         "/org/freedesktop/machine1",
1977                         "org.freedesktop.machine1.Manager",
1978                         "GetMachineByPID",
1979                         &error,
1980                         &reply,
1981                         "u",
1982                         (uint32_t) pid);
1983         if (r < 0) {
1984                 /* Note that the machine might already have been
1985                  * cleaned up automatically, hence don't consider it a
1986                  * failure if we cannot get the machine object. */
1987                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1988                 return 0;
1989         }
1990
1991         r = sd_bus_message_read(reply, "o", &path);
1992         if (r < 0)
1993                 return bus_log_parse_error(r);
1994
1995         r = sd_bus_call_method(
1996                         bus,
1997                         "org.freedesktop.machine1",
1998                         path,
1999                         "org.freedesktop.machine1.Machine",
2000                         "Terminate",
2001                         &error,
2002                         NULL,
2003                         NULL);
2004         if (r < 0) {
2005                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2006                 return 0;
2007         }
2008
2009         return 0;
2010 }
2011
2012 static int reset_audit_loginuid(void) {
2013         _cleanup_free_ char *p = NULL;
2014         int r;
2015
2016         if (arg_share_system)
2017                 return 0;
2018
2019         r = read_one_line_file("/proc/self/loginuid", &p);
2020         if (r == -ENOENT)
2021                 return 0;
2022         if (r < 0)
2023                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2024
2025         /* Already reset? */
2026         if (streq(p, "4294967295"))
2027                 return 0;
2028
2029         r = write_string_file("/proc/self/loginuid", "4294967295");
2030         if (r < 0) {
2031                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2032                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2033                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2034                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2035                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2036
2037                 sleep(5);
2038         }
2039
2040         return 0;
2041 }
2042
2043 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2044 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2045 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2046
2047 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2048         uint8_t result[8];
2049         size_t l, sz;
2050         uint8_t *v, *i;
2051         int r;
2052
2053         l = strlen(arg_machine);
2054         sz = sizeof(sd_id128_t) + l;
2055         if (idx > 0)
2056                 sz += sizeof(idx);
2057
2058         v = alloca(sz);
2059
2060         /* fetch some persistent data unique to the host */
2061         r = sd_id128_get_machine((sd_id128_t*) v);
2062         if (r < 0)
2063                 return r;
2064
2065         /* combine with some data unique (on this host) to this
2066          * container instance */
2067         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2068         if (idx > 0) {
2069                 idx = htole64(idx);
2070                 memcpy(i, &idx, sizeof(idx));
2071         }
2072
2073         /* Let's hash the host machine ID plus the container name. We
2074          * use a fixed, but originally randomly created hash key here. */
2075         siphash24(result, v, sz, hash_key.bytes);
2076
2077         assert_cc(ETH_ALEN <= sizeof(result));
2078         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2079
2080         /* see eth_random_addr in the kernel */
2081         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2082         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2083
2084         return 0;
2085 }
2086
2087 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2088         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2089         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2090         struct ether_addr mac_host, mac_container;
2091         int r, i;
2092
2093         if (!arg_private_network)
2094                 return 0;
2095
2096         if (!arg_network_veth)
2097                 return 0;
2098
2099         /* Use two different interface name prefixes depending whether
2100          * we are in bridge mode or not. */
2101         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2102                  arg_network_bridge ? "vb" : "ve", arg_machine);
2103
2104         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2105         if (r < 0)
2106                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2107
2108         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2109         if (r < 0)
2110                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2111
2112         r = sd_rtnl_open(&rtnl, 0);
2113         if (r < 0)
2114                 return log_error_errno(r, "Failed to connect to netlink: %m");
2115
2116         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2117         if (r < 0)
2118                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2119
2120         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2121         if (r < 0)
2122                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2123
2124         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2125         if (r < 0)
2126                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2127
2128         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2129         if (r < 0)
2130                 return log_error_errno(r, "Failed to open netlink container: %m");
2131
2132         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2133         if (r < 0)
2134                 return log_error_errno(r, "Failed to open netlink container: %m");
2135
2136         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2137         if (r < 0)
2138                 return log_error_errno(r, "Failed to open netlink container: %m");
2139
2140         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2141         if (r < 0)
2142                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2143
2144         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2145         if (r < 0)
2146                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2147
2148         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2149         if (r < 0)
2150                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2151
2152         r = sd_rtnl_message_close_container(m);
2153         if (r < 0)
2154                 return log_error_errno(r, "Failed to close netlink container: %m");
2155
2156         r = sd_rtnl_message_close_container(m);
2157         if (r < 0)
2158                 return log_error_errno(r, "Failed to close netlink container: %m");
2159
2160         r = sd_rtnl_message_close_container(m);
2161         if (r < 0)
2162                 return log_error_errno(r, "Failed to close netlink container: %m");
2163
2164         r = sd_rtnl_call(rtnl, m, 0, NULL);
2165         if (r < 0)
2166                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2167
2168         i = (int) if_nametoindex(iface_name);
2169         if (i <= 0)
2170                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2171
2172         *ifi = i;
2173
2174         return 0;
2175 }
2176
2177 static int setup_bridge(const char veth_name[], int *ifi) {
2178         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2179         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2180         int r, bridge;
2181
2182         if (!arg_private_network)
2183                 return 0;
2184
2185         if (!arg_network_veth)
2186                 return 0;
2187
2188         if (!arg_network_bridge)
2189                 return 0;
2190
2191         bridge = (int) if_nametoindex(arg_network_bridge);
2192         if (bridge <= 0)
2193                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2194
2195         *ifi = bridge;
2196
2197         r = sd_rtnl_open(&rtnl, 0);
2198         if (r < 0)
2199                 return log_error_errno(r, "Failed to connect to netlink: %m");
2200
2201         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2202         if (r < 0)
2203                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2204
2205         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2206         if (r < 0)
2207                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2208
2209         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2210         if (r < 0)
2211                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2212
2213         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2214         if (r < 0)
2215                 return log_error_errno(r, "Failed to add netlink master field: %m");
2216
2217         r = sd_rtnl_call(rtnl, m, 0, NULL);
2218         if (r < 0)
2219                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2220
2221         return 0;
2222 }
2223
2224 static int parse_interface(struct udev *udev, const char *name) {
2225         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2226         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2227         int ifi;
2228
2229         ifi = (int) if_nametoindex(name);
2230         if (ifi <= 0)
2231                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2232
2233         sprintf(ifi_str, "n%i", ifi);
2234         d = udev_device_new_from_device_id(udev, ifi_str);
2235         if (!d)
2236                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2237
2238         if (udev_device_get_is_initialized(d) <= 0) {
2239                 log_error("Network interface %s is not initialized yet.", name);
2240                 return -EBUSY;
2241         }
2242
2243         return ifi;
2244 }
2245
2246 static int move_network_interfaces(pid_t pid) {
2247         _cleanup_udev_unref_ struct udev *udev = NULL;
2248         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2249         char **i;
2250         int r;
2251
2252         if (!arg_private_network)
2253                 return 0;
2254
2255         if (strv_isempty(arg_network_interfaces))
2256                 return 0;
2257
2258         r = sd_rtnl_open(&rtnl, 0);
2259         if (r < 0)
2260                 return log_error_errno(r, "Failed to connect to netlink: %m");
2261
2262         udev = udev_new();
2263         if (!udev) {
2264                 log_error("Failed to connect to udev.");
2265                 return -ENOMEM;
2266         }
2267
2268         STRV_FOREACH(i, arg_network_interfaces) {
2269                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2270                 int ifi;
2271
2272                 ifi = parse_interface(udev, *i);
2273                 if (ifi < 0)
2274                         return ifi;
2275
2276                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2277                 if (r < 0)
2278                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2279
2280                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2281                 if (r < 0)
2282                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2283
2284                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2285                 if (r < 0)
2286                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2287         }
2288
2289         return 0;
2290 }
2291
2292 static int setup_macvlan(pid_t pid) {
2293         _cleanup_udev_unref_ struct udev *udev = NULL;
2294         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2295         unsigned idx = 0;
2296         char **i;
2297         int r;
2298
2299         if (!arg_private_network)
2300                 return 0;
2301
2302         if (strv_isempty(arg_network_macvlan))
2303                 return 0;
2304
2305         r = sd_rtnl_open(&rtnl, 0);
2306         if (r < 0)
2307                 return log_error_errno(r, "Failed to connect to netlink: %m");
2308
2309         udev = udev_new();
2310         if (!udev) {
2311                 log_error("Failed to connect to udev.");
2312                 return -ENOMEM;
2313         }
2314
2315         STRV_FOREACH(i, arg_network_macvlan) {
2316                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2317                 _cleanup_free_ char *n = NULL;
2318                 struct ether_addr mac;
2319                 int ifi;
2320
2321                 ifi = parse_interface(udev, *i);
2322                 if (ifi < 0)
2323                         return ifi;
2324
2325                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2326                 if (r < 0)
2327                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2328
2329                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2330                 if (r < 0)
2331                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2332
2333                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2334                 if (r < 0)
2335                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2336
2337                 n = strappend("mv-", *i);
2338                 if (!n)
2339                         return log_oom();
2340
2341                 strshorten(n, IFNAMSIZ-1);
2342
2343                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2344                 if (r < 0)
2345                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2346
2347                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2348                 if (r < 0)
2349                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2350
2351                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2352                 if (r < 0)
2353                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2354
2355                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2356                 if (r < 0)
2357                         return log_error_errno(r, "Failed to open netlink container: %m");
2358
2359                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2360                 if (r < 0)
2361                         return log_error_errno(r, "Failed to open netlink container: %m");
2362
2363                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2364                 if (r < 0)
2365                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2366
2367                 r = sd_rtnl_message_close_container(m);
2368                 if (r < 0)
2369                         return log_error_errno(r, "Failed to close netlink container: %m");
2370
2371                 r = sd_rtnl_message_close_container(m);
2372                 if (r < 0)
2373                         return log_error_errno(r, "Failed to close netlink container: %m");
2374
2375                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2376                 if (r < 0)
2377                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2378         }
2379
2380         return 0;
2381 }
2382
/* Installs the seccomp filter used for the container payload: a set of
 * system calls is made to fail with EPERM, and creation of
 * AF_NETLINK/NETLINK_AUDIT sockets is made to fail with EAFNOSUPPORT.
 * Everything else stays allowed.
 *
 * Returns 0 on success (always 0 when built without HAVE_SECCOMP),
 * negative errno-style value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* System calls that are refused with EPERM */
        static const int denied_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int ret;

        /* Default action: allow */
        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        ret = seccomp_add_secondary_archs(ctx);
        if (ret < 0) {
                log_error_errno(ret, "Failed to add secondary archs to seccomp filter: %m");
                goto out;
        }

        for (n = 0; n < ELEMENTSOF(denied_syscalls); n++) {
                ret = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), denied_syscalls[n], 0);
                if (ret >= 0)
                        continue;

                /* -EFAULT is treated as "unknown syscall" and skipped */
                if (ret != -EFAULT) {
                        log_error_errno(ret, "Failed to block syscall: %m");
                        goto out;
                }
        }

        /* Audit does not work inside containers, and much of the audit
         * userspace breaks when it tries anyway. So audit socket
         * creation is simply turned off: socket(AF_NETLINK, *,
         * NETLINK_AUDIT) fails with EAFNOSUPPORT, which audit userspace
         * takes as the sign that the kernel has audit disabled. */
        ret = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (ret < 0) {
                log_error_errno(ret, "Failed to add audit seccomp rule: %m");
                goto out;
        }

        /* Do not let libseccomp turn on NO_NEW_PRIVS for us */
        ret = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (ret < 0) {
                log_error_errno(ret, "Failed to unset NO_NEW_PRIVS: %m");
                goto out;
        }

        ret = seccomp_load(ctx);
        if (ret < 0)
                log_error_errno(ret, "Failed to install seccomp audit filter: %m");

out:
        seccomp_release(ctx);
        return ret;
#else
        return 0;
#endif

}
2462
2463 static int setup_propagate(const char *root) {
2464         const char *p, *q;
2465
2466         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2467         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2468         p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2469         (void) mkdir_p(p, 0600);
2470
2471         q = strappenda(root, "/run/systemd/nspawn/incoming");
2472         mkdir_parents(q, 0755);
2473         mkdir_p(q, 0600);
2474
2475         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2476                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2477
2478         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2479                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2480
2481         return 0;
2482 }
2483
2484 static int setup_image(char **device_path, int *loop_nr) {
2485         struct loop_info64 info = {
2486                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2487         };
2488         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2489         _cleanup_free_ char* loopdev = NULL;
2490         struct stat st;
2491         int r, nr;
2492
2493         assert(device_path);
2494         assert(loop_nr);
2495         assert(arg_image);
2496
2497         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2498         if (fd < 0)
2499                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2500
2501         if (fstat(fd, &st) < 0)
2502                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2503
2504         if (S_ISBLK(st.st_mode)) {
2505                 char *p;
2506
2507                 p = strdup(arg_image);
2508                 if (!p)
2509                         return log_oom();
2510
2511                 *device_path = p;
2512
2513                 *loop_nr = -1;
2514
2515                 r = fd;
2516                 fd = -1;
2517
2518                 return r;
2519         }
2520
2521         if (!S_ISREG(st.st_mode)) {
2522                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2523                 return -EINVAL;
2524         }
2525
2526         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2527         if (control < 0)
2528                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2529
2530         nr = ioctl(control, LOOP_CTL_GET_FREE);
2531         if (nr < 0)
2532                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2533
2534         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2535                 return log_oom();
2536
2537         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2538         if (loop < 0)
2539                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2540
2541         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2542                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2543
2544         if (arg_read_only)
2545                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2546
2547         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2548                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2549
2550         *device_path = loopdev;
2551         loopdev = NULL;
2552
2553         *loop_nr = nr;
2554
2555         r = loop;
2556         loop = -1;
2557
2558         return r;
2559 }
2560
#ifdef HAVE_BLKID
/* Helper for dissect_image(): records "node" as the current candidate
 * in *best unless a candidate with a lower or equal partition number
 * is already stored there. *best_rw is derived from the GPT read-only
 * flag. Returns 0, or the result of log_oom() if the node path cannot
 * be duplicated. */
static int dissect_remember_partition(
                char **best, int *best_nr, bool *best_rw,
                const char *node, int nr, unsigned long long flags) {

        if (*best && nr >= *best_nr)
                return 0;

        *best_nr = nr;
        *best_rw = !(flags & GPT_FLAG_READ_ONLY);

        free(*best);
        *best = strdup(node);
        if (!*best)
                return log_oom();

        return 0;
}
#endif

/* Looks at the GPT partition table of the block device "fd" and picks
 * the partitions to use as root, /home and /srv.
 *
 * For each type the partition with the lowest partition number wins;
 * partitions carrying GPT_FLAG_NO_AUTO are ignored. A native root
 * partition is preferred; if only a secondary-architecture root is
 * found it is used and *secondary is set to true.
 *
 * On success 0 is returned, *root_device/*root_device_rw/*secondary
 * are filled in, and the home/srv outputs are filled in only if such a
 * partition was found. The device strings are owned by the caller.
 * On failure (including: no GPT, no root partition, built without
 * blkid) a negative errno-style value is returned. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif

        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerate = NULL;
        _cleanup_udev_device_unref_ struct udev_device *whole_disk = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev_ctx = NULL;
        struct udev_list_entry *head, *entry;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *table_type = NULL;
        blkid_partlist partitions;
        struct stat disk_st;
        int k;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        /* Step 1: probe the partition table with libblkid */
        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        errno = 0;
        k = blkid_probe_set_device(probe, fd, 0, 0);
        if (k != 0) {
                if (errno == 0)
                        return log_oom();

                return log_error_errno(errno, "Failed to set device on blkid probe: %m");
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        k = blkid_do_safeprobe(probe);
        if (k == -2 || k == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }
        if (k != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe: %m");
        }

        blkid_probe_lookup_value(probe, "PTTYPE", &table_type, NULL);
        if (!streq_ptr(table_type, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partitions = blkid_probe_get_partitions(probe);
        if (!partitions) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        /* Step 2: enumerate the child block devices of the disk via udev */
        udev_ctx = udev_new();
        if (!udev_ctx)
                return log_oom();

        if (fstat(fd, &disk_st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        whole_disk = udev_device_new_from_devnum(udev_ctx, 'b', disk_st.st_rdev);
        if (!whole_disk)
                return log_oom();

        enumerate = udev_enumerate_new(udev_ctx);
        if (!enumerate)
                return log_oom();

        k = udev_enumerate_add_match_parent(enumerate, whole_disk);
        if (k < 0)
                return log_oom();

        k = udev_enumerate_scan_devices(enumerate);
        if (k < 0)
                return log_error_errno(k, "Failed to scan for partition devices of %s: %m", arg_image);

        /* Step 3: match every udev device against the blkid partition list */
        head = udev_enumerate_get_list_entry(enumerate);
        udev_list_entry_foreach(entry, head) {
                _cleanup_udev_device_unref_ struct udev_device *part_dev = NULL;
                const char *type_string, *devnode;
                unsigned long long gpt_flags;
                sd_id128_t type_uuid;
                blkid_partition part;
                dev_t part_devnum;
                int partno;

                errno = 0;
                part_dev = udev_device_new_from_syspath(udev_ctx, udev_list_entry_get_name(entry));
                if (!part_dev) {
                        if (!errno)
                                errno = ENOMEM;

                        return log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                }

                part_devnum = udev_device_get_devnum(part_dev);
                if (major(part_devnum) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (disk_st.st_rdev == part_devnum)
                        continue;

                devnode = udev_device_get_devnode(part_dev);
                if (!devnode)
                        continue;

                part = blkid_partlist_devno_to_partition(partitions, part_devnum);
                if (!part)
                        continue;

                gpt_flags = blkid_partition_get_flags(part);
                if (gpt_flags & GPT_FLAG_NO_AUTO)
                        continue;

                partno = blkid_partition_get_partno(part);
                if (partno < 0)
                        continue;

                type_string = blkid_partition_get_type_string(part);
                if (!type_string)
                        continue;

                if (sd_id128_from_string(type_string, &type_uuid) < 0)
                        continue;

                /* Dispatch on the partition type UUID */
                if (sd_id128_equal(type_uuid, GPT_HOME))
                        k = dissect_remember_partition(&home, &home_nr, &home_rw, devnode, partno, gpt_flags);
                else if (sd_id128_equal(type_uuid, GPT_SRV))
                        k = dissect_remember_partition(&srv, &srv_nr, &srv_rw, devnode, partno, gpt_flags);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_uuid, GPT_ROOT_NATIVE))
                        k = dissect_remember_partition(&root, &root_nr, &root_rw, devnode, partno, gpt_flags);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_uuid, GPT_ROOT_SECONDARY))
                        k = dissect_remember_partition(&secondary_root, &secondary_root_nr, &secondary_root_rw, devnode, partno, gpt_flags);
#endif
                else
                        k = 0;

                if (k < 0)
                        return k;
        }

        /* Step 4: hand the results to the caller */
        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else {
                /* Only the secondary root is left at this point */
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2814
/* Mounts the block device "what" onto "where", or onto "where" with
 * "directory" appended if "directory" is non-NULL. The file system
 * type is detected with libblkid. The mount is read-only if "rw" is
 * false or arg_read_only is set, and always uses MS_NODEV.
 *
 * LUKS containers are rejected with -ENOTSUP. Returns 0 on success,
 * negative errno-style value on failure (always -ENOTSUP when built
 * without blkid). */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2
         * if the result is ambivalent; both mean we cannot tell the
         * file system type. -1 is a genuine probing error and is
         * handled by the branch below. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2878
2879 static int mount_devices(
2880                 const char *where,
2881                 const char *root_device, bool root_device_rw,
2882                 const char *home_device, bool home_device_rw,
2883                 const char *srv_device, bool srv_device_rw) {
2884         int r;
2885
2886         assert(where);
2887
2888         if (root_device) {
2889                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2890                 if (r < 0)
2891                         return log_error_errno(r, "Failed to mount root directory: %m");
2892         }
2893
2894         if (home_device) {
2895                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2896                 if (r < 0)
2897                         return log_error_errno(r, "Failed to mount home directory: %m");
2898         }
2899
2900         if (srv_device) {
2901                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2902                 if (r < 0)
2903                         return log_error_errno(r, "Failed to mount server data directory: %m");
2904         }
2905
2906         return 0;
2907 }
2908
2909 static void loop_remove(int nr, int *image_fd) {
2910         _cleanup_close_ int control = -1;
2911         int r;
2912
2913         if (nr < 0)
2914                 return;
2915
2916         if (image_fd && *image_fd >= 0) {
2917                 r = ioctl(*image_fd, LOOP_CLR_FD);
2918                 if (r < 0)
2919                         log_warning_errno(errno, "Failed to close loop image: %m");
2920                 *image_fd = safe_close(*image_fd);
2921         }
2922
2923         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2924         if (control < 0) {
2925                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2926                 return;
2927         }
2928
2929         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2930         if (r < 0)
2931                 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
2932 }
2933
2934 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2935         int pipe_fds[2];
2936         pid_t pid;
2937
2938         assert(database);
2939         assert(key);
2940         assert(rpid);
2941
2942         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2943                 return log_error_errno(errno, "Failed to allocate pipe: %m");
2944
2945         pid = fork();
2946         if (pid < 0)
2947                 return log_error_errno(errno, "Failed to fork getent child: %m");
2948         else if (pid == 0) {
2949                 int nullfd;
2950                 char *empty_env = NULL;
2951
2952                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2953                         _exit(EXIT_FAILURE);
2954
2955                 if (pipe_fds[0] > 2)
2956                         safe_close(pipe_fds[0]);
2957                 if (pipe_fds[1] > 2)
2958                         safe_close(pipe_fds[1]);
2959
2960                 nullfd = open("/dev/null", O_RDWR);
2961                 if (nullfd < 0)
2962                         _exit(EXIT_FAILURE);
2963
2964                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2965                         _exit(EXIT_FAILURE);
2966
2967                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2968                         _exit(EXIT_FAILURE);
2969
2970                 if (nullfd > 2)
2971                         safe_close(nullfd);
2972
2973                 reset_all_signal_handlers();
2974                 close_all_fds(NULL, 0);
2975
2976                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2977                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2978                 _exit(EXIT_FAILURE);
2979         }
2980
2981         pipe_fds[1] = safe_close(pipe_fds[1]);
2982
2983         *rpid = pid;
2984
2985         return pipe_fds[0];
2986 }
2987
2988 static int change_uid_gid(char **_home) {
2989         char line[LINE_MAX], *x, *u, *g, *h;
2990         const char *word, *state;
2991         _cleanup_free_ uid_t *uids = NULL;
2992         _cleanup_free_ char *home = NULL;
2993         _cleanup_fclose_ FILE *f = NULL;
2994         _cleanup_close_ int fd = -1;
2995         unsigned n_uids = 0;
2996         size_t sz = 0, l;
2997         uid_t uid;
2998         gid_t gid;
2999         pid_t pid;
3000         int r;
3001
3002         assert(_home);
3003
3004         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3005                 /* Reset everything fully to 0, just in case */
3006
3007                 if (setgroups(0, NULL) < 0)
3008                         return log_error_errno(errno, "setgroups() failed: %m");
3009
3010                 if (setresgid(0, 0, 0) < 0)
3011                         return log_error_errno(errno, "setregid() failed: %m");
3012
3013                 if (setresuid(0, 0, 0) < 0)
3014                         return log_error_errno(errno, "setreuid() failed: %m");
3015
3016                 *_home = NULL;
3017                 return 0;
3018         }
3019
3020         /* First, get user credentials */
3021         fd = spawn_getent("passwd", arg_user, &pid);
3022         if (fd < 0)
3023                 return fd;
3024
3025         f = fdopen(fd, "r");
3026         if (!f)
3027                 return log_oom();
3028         fd = -1;
3029
3030         if (!fgets(line, sizeof(line), f)) {
3031
3032                 if (!ferror(f)) {
3033                         log_error("Failed to resolve user %s.", arg_user);
3034                         return -ESRCH;
3035                 }
3036
3037                 log_error_errno(errno, "Failed to read from getent: %m");
3038                 return -errno;
3039         }
3040
3041         truncate_nl(line);
3042
3043         wait_for_terminate_and_warn("getent passwd", pid, true);
3044
3045         x = strchr(line, ':');
3046         if (!x) {
3047                 log_error("/etc/passwd entry has invalid user field.");
3048                 return -EIO;
3049         }
3050
3051         u = strchr(x+1, ':');
3052         if (!u) {
3053                 log_error("/etc/passwd entry has invalid password field.");
3054                 return -EIO;
3055         }
3056
3057         u++;
3058         g = strchr(u, ':');
3059         if (!g) {
3060                 log_error("/etc/passwd entry has invalid UID field.");
3061                 return -EIO;
3062         }
3063
3064         *g = 0;
3065         g++;
3066         x = strchr(g, ':');
3067         if (!x) {
3068                 log_error("/etc/passwd entry has invalid GID field.");
3069                 return -EIO;
3070         }
3071
3072         *x = 0;
3073         h = strchr(x+1, ':');
3074         if (!h) {
3075                 log_error("/etc/passwd entry has invalid GECOS field.");
3076                 return -EIO;
3077         }
3078
3079         h++;
3080         x = strchr(h, ':');
3081         if (!x) {
3082                 log_error("/etc/passwd entry has invalid home directory field.");
3083                 return -EIO;
3084         }
3085
3086         *x = 0;
3087
3088         r = parse_uid(u, &uid);
3089         if (r < 0) {
3090                 log_error("Failed to parse UID of user.");
3091                 return -EIO;
3092         }
3093
3094         r = parse_gid(g, &gid);
3095         if (r < 0) {
3096                 log_error("Failed to parse GID of user.");
3097                 return -EIO;
3098         }
3099
3100         home = strdup(h);
3101         if (!home)
3102                 return log_oom();
3103
3104         /* Second, get group memberships */
3105         fd = spawn_getent("initgroups", arg_user, &pid);
3106         if (fd < 0)
3107                 return fd;
3108
3109         fclose(f);
3110         f = fdopen(fd, "r");
3111         if (!f)
3112                 return log_oom();
3113         fd = -1;
3114
3115         if (!fgets(line, sizeof(line), f)) {
3116                 if (!ferror(f)) {
3117                         log_error("Failed to resolve user %s.", arg_user);
3118                         return -ESRCH;
3119                 }
3120
3121                 log_error_errno(errno, "Failed to read from getent: %m");
3122                 return -errno;
3123         }
3124
3125         truncate_nl(line);
3126
3127         wait_for_terminate_and_warn("getent initgroups", pid, true);
3128
3129         /* Skip over the username and subsequent separator whitespace */
3130         x = line;
3131         x += strcspn(x, WHITESPACE);
3132         x += strspn(x, WHITESPACE);
3133
3134         FOREACH_WORD(word, l, x, state) {
3135                 char c[l+1];
3136
3137                 memcpy(c, word, l);
3138                 c[l] = 0;
3139
3140                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3141                         return log_oom();
3142
3143                 r = parse_uid(c, &uids[n_uids++]);
3144                 if (r < 0) {
3145                         log_error("Failed to parse group data from getent.");
3146                         return -EIO;
3147                 }
3148         }
3149
3150         r = mkdir_parents(home, 0775);
3151         if (r < 0)
3152                 return log_error_errno(r, "Failed to make home root directory: %m");
3153
3154         r = mkdir_safe(home, 0755, uid, gid);
3155         if (r < 0 && r != -EEXIST)
3156                 return log_error_errno(r, "Failed to make home directory: %m");
3157
3158         fchown(STDIN_FILENO, uid, gid);
3159         fchown(STDOUT_FILENO, uid, gid);
3160         fchown(STDERR_FILENO, uid, gid);
3161
3162         if (setgroups(n_uids, uids) < 0)
3163                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3164
3165         if (setresgid(gid, gid, gid) < 0)
3166                 return log_error_errno(errno, "setregid() failed: %m");
3167
3168         if (setresuid(uid, uid, uid) < 0)
3169                 return log_error_errno(errno, "setreuid() failed: %m");
3170
3171         if (_home) {
3172                 *_home = home;
3173                 home = NULL;
3174         }
3175
3176         return 0;
3177 }
3178
3179 /*
3180  * Return values:
3181  * < 0 : wait_for_terminate() failed to get the state of the
3182  *       container, the container was terminated by a signal, or
3183  *       failed for an unknown reason.  No change is made to the
3184  *       container argument.
3185  * > 0 : The program executed in the container terminated with an
3186  *       error.  The exit code of the program executed in the
3187  *       container is returned.  The container argument has been set
3188  *       to CONTAINER_TERMINATED.
3189  *   0 : The container is being rebooted, has been shut down or exited
3190  *       successfully.  The container argument has been set to either
3191  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3192  *
3193  * That is, success is indicated by a return value of zero, and an
3194  * error is indicated by a non-zero value.
3195  */
3196 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3197         siginfo_t status;
3198         int r;
3199
3200         r = wait_for_terminate(pid, &status);
3201         if (r < 0)
3202                 return log_warning_errno(r, "Failed to wait for container: %m");
3203
3204         switch (status.si_code) {
3205
3206         case CLD_EXITED:
3207                 if (status.si_status == 0) {
3208                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3209
3210                 } else
3211                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3212
3213                 *container = CONTAINER_TERMINATED;
3214                 return status.si_status;
3215
3216         case CLD_KILLED:
3217                 if (status.si_status == SIGINT) {
3218
3219                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3220                         *container = CONTAINER_TERMINATED;
3221                         return 0;
3222
3223                 } else if (status.si_status == SIGHUP) {
3224
3225                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3226                         *container = CONTAINER_REBOOTED;
3227                         return 0;
3228                 }
3229
3230                 /* CLD_KILLED fallthrough */
3231
3232         case CLD_DUMPED:
3233                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3234                 return -EIO;
3235
3236         default:
3237                 log_error("Container %s failed due to unknown reason.", arg_machine);
3238                 return -EIO;
3239         }
3240
3241         return r;
3242 }
3243
/* Signal handler that deliberately does nothing. main() installs it
 * for SIGCHLD so that a dying child interrupts the parent's blocking
 * calls. */
static void nop_handler(int sig) {
        (void) sig;
}
3245
3246 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3247         pid_t pid;
3248
3249         pid = PTR_TO_UINT32(userdata);
3250         if (pid > 0) {
3251                 if (kill(pid, SIGRTMIN+3) >= 0) {
3252                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3253                         sd_event_source_set_userdata(s, NULL);
3254                         return 0;
3255                 }
3256         }
3257
3258         sd_event_exit(sd_event_source_get_event(s), 0);
3259         return 0;
3260 }
3261
3262 static int determine_names(void) {
3263         int r;
3264
3265         if (!arg_image && !arg_directory) {
3266                 if (arg_machine) {
3267                         _cleanup_(image_unrefp) Image *i = NULL;
3268
3269                         r = image_find(arg_machine, &i);
3270                         if (r < 0)
3271                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3272                         else if (r == 0) {
3273                                 log_error("No image for machine '%s': %m", arg_machine);
3274                                 return -ENOENT;
3275                         }
3276
3277                         if (i->type == IMAGE_GPT)
3278                                 r = set_sanitized_path(&arg_image, i->path);
3279                         else
3280                                 r = set_sanitized_path(&arg_directory, i->path);
3281                         if (r < 0)
3282                                 return log_error_errno(r, "Invalid image directory: %m");
3283
3284                         arg_read_only = arg_read_only || i->read_only;
3285                 } else
3286                         arg_directory = get_current_dir_name();
3287
3288                 if (!arg_directory && !arg_machine) {
3289                         log_error("Failed to determine path, please use -D or -i.");
3290                         return -EINVAL;
3291                 }
3292         }
3293
3294         if (!arg_machine) {
3295                 if (arg_directory && path_equal(arg_directory, "/"))
3296                         arg_machine = gethostname_malloc();
3297                 else
3298                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3299
3300                 if (!arg_machine)
3301                         return log_oom();
3302
3303                 hostname_cleanup(arg_machine, false);
3304                 if (!machine_name_is_valid(arg_machine)) {
3305                         log_error("Failed to determine machine name automatically, please use -M.");
3306                         return -EINVAL;
3307                 }
3308
3309                 if (arg_ephemeral) {
3310                         char *b;
3311
3312                         /* Add a random suffix when this is an
3313                          * ephemeral machine, so that we can run many
3314                          * instances at once without manually having
3315                          * to specify -M each time. */
3316
3317                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3318                                 return log_oom();
3319
3320                         free(arg_machine);
3321                         arg_machine = b;
3322                 }
3323         }
3324
3325         return 0;
3326 }
3327
3328 int main(int argc, char *argv[]) {
3329
3330         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3331         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3332         _cleanup_close_ int master = -1, image_fd = -1;
3333         _cleanup_fdset_free_ FDSet *fds = NULL;
3334         int r, n_fd_passed, loop_nr = -1;
3335         char veth_name[IFNAMSIZ];
3336         bool secondary = false, remove_subvol = false;
3337         sigset_t mask, mask_chld;
3338         pid_t pid = 0;
3339         int ret = EXIT_SUCCESS;
3340         union in_addr_union exposed = {};
3341
3342         log_parse_environment();
3343         log_open();
3344
3345         r = parse_argv(argc, argv);
3346         if (r <= 0)
3347                 goto finish;
3348
3349         r = determine_names();
3350         if (r < 0)
3351                 goto finish;
3352
3353         if (geteuid() != 0) {
3354                 log_error("Need to be root.");
3355                 r = -EPERM;
3356                 goto finish;
3357         }
3358
3359         if (sd_booted() <= 0) {
3360                 log_error("Not running on a systemd system.");
3361                 r = -EINVAL;
3362                 goto finish;
3363         }
3364
3365         log_close();
3366         n_fd_passed = sd_listen_fds(false);
3367         if (n_fd_passed > 0) {
3368                 r = fdset_new_listen_fds(&fds, false);
3369                 if (r < 0) {
3370                         log_error_errno(r, "Failed to collect file descriptors: %m");
3371                         goto finish;
3372                 }
3373         }
3374         fdset_close_others(fds);
3375         log_open();
3376
3377         if (arg_directory) {
3378                 assert(!arg_image);
3379
3380                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3381                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3382                         r = -EINVAL;
3383                         goto finish;
3384                 }
3385
3386                 if (arg_template) {
3387                         r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3388                         if (r == -EEXIST) {
3389                                 if (!arg_quiet)
3390                                         log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3391                         } else if (r < 0) {
3392                                 log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
3393                                 goto finish;
3394                         } else {
3395                                 if (!arg_quiet)
3396                                         log_info("Populated %s from template %s.", arg_directory, arg_template);
3397                         }
3398
3399                 } else if (arg_ephemeral) {
3400                         char *np;
3401
3402                         /* If the specified path is a mount point we
3403                          * generate the new snapshot immediately
3404                          * inside it under a random name. However if
3405                          * the specified is not a mount point we
3406                          * create the new snapshot in the parent
3407                          * directory, just next to it. */
3408                         r = path_is_mount_point(arg_directory, false);
3409                         if (r < 0) {
3410                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3411                                 goto finish;
3412                         }
3413                         if (r > 0)
3414                                 r = tempfn_random_child(arg_directory, &np);
3415                         else
3416                                 r = tempfn_random(arg_directory, &np);
3417                         if (r < 0) {
3418                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3419                                 goto finish;
3420                         }
3421
3422                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3423                         if (r < 0) {
3424                                 free(np);
3425                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3426                                 goto finish;
3427                         }
3428
3429                         free(arg_directory);
3430                         arg_directory = np;
3431
3432                         remove_subvol = true;
3433                 }
3434
3435                 if (arg_boot) {
3436                         if (path_is_os_tree(arg_directory) <= 0) {
3437                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3438                                 r = -EINVAL;
3439                                 goto finish;
3440                         }
3441                 } else {
3442                         const char *p;
3443
3444                         p = strappenda(arg_directory,
3445                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3446                         if (access(p, F_OK) < 0) {
3447                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3448                                 r = -EINVAL;
3449                                 goto finish;
3450                         }
3451                 }
3452
3453         } else {
3454                 char template[] = "/tmp/nspawn-root-XXXXXX";
3455
3456                 assert(arg_image);
3457                 assert(!arg_template);
3458
3459                 if (!mkdtemp(template)) {
3460                         log_error_errno(errno, "Failed to create temporary directory: %m");
3461                         r = -errno;
3462                         goto finish;
3463                 }
3464
3465                 arg_directory = strdup(template);
3466                 if (!arg_directory) {
3467                         r = log_oom();
3468                         goto finish;
3469                 }
3470
3471                 image_fd = setup_image(&device_path, &loop_nr);
3472                 if (image_fd < 0) {
3473                         r = image_fd;
3474                         goto finish;
3475                 }
3476
3477                 r = dissect_image(image_fd,
3478                                   &root_device, &root_device_rw,
3479                                   &home_device, &home_device_rw,
3480                                   &srv_device, &srv_device_rw,
3481                                   &secondary);
3482                 if (r < 0)
3483                         goto finish;
3484         }
3485
3486         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3487         if (master < 0) {
3488                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3489                 goto finish;
3490         }
3491
3492         r = ptsname_malloc(master, &console);
3493         if (r < 0) {
3494                 r = log_error_errno(r, "Failed to determine tty name: %m");
3495                 goto finish;
3496         }
3497
3498         if (!arg_quiet)
3499                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3500                          arg_machine, arg_image ?: arg_directory);
3501
3502         if (unlockpt(master) < 0) {
3503                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3504                 goto finish;
3505         }
3506
3507         assert_se(sigemptyset(&mask) == 0);
3508         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3509         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3510
3511         assert_se(sigemptyset(&mask_chld) == 0);
3512         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3513
3514         for (;;) {
3515                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3516                 ContainerStatus container_status;
3517                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3518                 struct sigaction sa = {
3519                         .sa_handler = nop_handler,
3520                         .sa_flags = SA_NOCLDSTOP,
3521                 };
3522
3523                 r = barrier_create(&barrier);
3524                 if (r < 0) {
3525                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3526                         goto finish;
3527                 }
3528
3529                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3530                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3531                         goto finish;
3532                 }
3533
3534                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3535                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3536                         goto finish;
3537                 }
3538
3539                 /* Child can be killed before execv(), so handle SIGCHLD
3540                  * in order to interrupt parent's blocking calls and
3541                  * give it a chance to call wait() and terminate. */
3542                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3543                 if (r < 0) {
3544                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3545                         goto finish;
3546                 }
3547
3548                 r = sigaction(SIGCHLD, &sa, NULL);
3549                 if (r < 0) {
3550                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3551                         goto finish;
3552                 }
3553
3554                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3555                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3556                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3557                 if (pid < 0) {
3558                         if (errno == EINVAL)
3559                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3560                         else
3561                                 r = log_error_errno(errno, "clone() failed: %m");
3562
3563                         goto finish;
3564                 }
3565
3566                 if (pid == 0) {
3567                         /* child */
3568                         _cleanup_free_ char *home = NULL;
3569                         unsigned n_env = 2;
3570                         const char *envp[] = {
3571                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3572                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3573                                 NULL, /* TERM */
3574                                 NULL, /* HOME */
3575                                 NULL, /* USER */
3576                                 NULL, /* LOGNAME */
3577                                 NULL, /* container_uuid */
3578                                 NULL, /* LISTEN_FDS */
3579                                 NULL, /* LISTEN_PID */
3580                                 NULL
3581                         };
3582