chiark / gitweb /
ce9a9e84f69af363b33739a7bba64140f27f9f76
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46
47 #ifdef HAVE_SELINUX
48 #include <selinux/selinux.h>
49 #endif
50
51 #ifdef HAVE_SECCOMP
52 #include <seccomp.h>
53 #endif
54
55 #ifdef HAVE_BLKID
56 #include <blkid/blkid.h>
57 #endif
58
59 #include "sd-daemon.h"
60 #include "sd-bus.h"
61 #include "sd-id128.h"
62 #include "sd-rtnl.h"
63 #include "log.h"
64 #include "util.h"
65 #include "mkdir.h"
66 #include "macro.h"
67 #include "audit.h"
68 #include "missing.h"
69 #include "cgroup-util.h"
70 #include "strv.h"
71 #include "path-util.h"
72 #include "loopback-setup.h"
73 #include "dev-setup.h"
74 #include "fdset.h"
75 #include "build.h"
76 #include "fileio.h"
77 #include "bus-util.h"
78 #include "bus-error.h"
79 #include "ptyfwd.h"
80 #include "bus-kernel.h"
81 #include "env-util.h"
82 #include "def.h"
83 #include "rtnl-util.h"
84 #include "udev-util.h"
85 #include "blkid-util.h"
86 #include "gpt.h"
87 #include "siphash24.h"
88 #include "copy.h"
89 #include "base-filesystem.h"
90 #include "barrier.h"
91 #include "event-util.h"
92 #include "capability.h"
93 #include "cap-list.h"
94 #include "btrfs-util.h"
95 #include "machine-image.h"
96 #include "list.h"
97 #include "in-addr-util.h"
98 #include "fw-util.h"
99 #include "local-addresses.h"
100
101 #ifdef HAVE_SECCOMP
102 #include "seccomp-util.h"
103 #endif
104
105 typedef struct ExposePort {
106         int protocol;
107         uint16_t host_port;
108         uint16_t container_port;
109         LIST_FIELDS(struct ExposePort, ports);
110 } ExposePort;
111
/* Ways in which a container run can end.
 * NOTE(review): the consumers of this enum lie outside this excerpt; the
 * meaning of each value is taken from its name only -- confirm at use site. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
116
/* Journal linking mode selected with --link-journal= (or -j). The "try-"
 * variants of the command line switch map to LINK_GUEST/LINK_HOST as well and
 * are distinguished by the separate arg_link_journal_try flag. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto, also the built-in default */
        LINK_HOST  = 2,  /* --link-journal=host / try-host */
        LINK_GUEST = 3   /* --link-journal=guest / try-guest / -j */
} LinkJournal;
123
/* Volatile mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,  /* default, or --volatile=<boolean false> */
        VOLATILE_YES   = 1,  /* --volatile without argument, or <boolean true> */
        VOLATILE_STATE = 2   /* --volatile=state */
} Volatile;
129
130 static char *arg_directory = NULL;
131 static char *arg_template = NULL;
132 static char *arg_user = NULL;
133 static sd_id128_t arg_uuid = {};
134 static char *arg_machine = NULL;
135 static const char *arg_selinux_context = NULL;
136 static const char *arg_selinux_apifs_context = NULL;
137 static const char *arg_slice = NULL;
138 static bool arg_private_network = false;
139 static bool arg_read_only = false;
140 static bool arg_boot = false;
141 static bool arg_ephemeral = false;
142 static LinkJournal arg_link_journal = LINK_AUTO;
143 static bool arg_link_journal_try = false;
144 static uint64_t arg_retain =
145         (1ULL << CAP_CHOWN) |
146         (1ULL << CAP_DAC_OVERRIDE) |
147         (1ULL << CAP_DAC_READ_SEARCH) |
148         (1ULL << CAP_FOWNER) |
149         (1ULL << CAP_FSETID) |
150         (1ULL << CAP_IPC_OWNER) |
151         (1ULL << CAP_KILL) |
152         (1ULL << CAP_LEASE) |
153         (1ULL << CAP_LINUX_IMMUTABLE) |
154         (1ULL << CAP_NET_BIND_SERVICE) |
155         (1ULL << CAP_NET_BROADCAST) |
156         (1ULL << CAP_NET_RAW) |
157         (1ULL << CAP_SETGID) |
158         (1ULL << CAP_SETFCAP) |
159         (1ULL << CAP_SETPCAP) |
160         (1ULL << CAP_SETUID) |
161         (1ULL << CAP_SYS_ADMIN) |
162         (1ULL << CAP_SYS_CHROOT) |
163         (1ULL << CAP_SYS_NICE) |
164         (1ULL << CAP_SYS_PTRACE) |
165         (1ULL << CAP_SYS_TTY_CONFIG) |
166         (1ULL << CAP_SYS_RESOURCE) |
167         (1ULL << CAP_SYS_BOOT) |
168         (1ULL << CAP_AUDIT_WRITE) |
169         (1ULL << CAP_AUDIT_CONTROL) |
170         (1ULL << CAP_MKNOD);
171 static char **arg_bind = NULL;
172 static char **arg_bind_ro = NULL;
173 static char **arg_tmpfs = NULL;
174 static char **arg_setenv = NULL;
175 static bool arg_quiet = false;
176 static bool arg_share_system = false;
177 static bool arg_register = true;
178 static bool arg_keep_unit = false;
179 static char **arg_network_interfaces = NULL;
180 static char **arg_network_macvlan = NULL;
181 static bool arg_network_veth = false;
182 static const char *arg_network_bridge = NULL;
183 static unsigned long arg_personality = 0xffffffffLU;
184 static char *arg_image = NULL;
185 static Volatile arg_volatile = VOLATILE_NO;
186 static ExposePort *arg_expose_ports = NULL;
187
188 static void help(void) {
189         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
190                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
191                "  -h --help                 Show this help\n"
192                "     --version              Print version string\n"
193                "  -q --quiet                Do not show status information\n"
194                "  -D --directory=PATH       Root directory for the container\n"
195                "     --template=PATH        Initialize root directory from template directory,\n"
196                "                            if missing\n"
197                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
198                "                            remove it after exit\n"
199                "  -i --image=PATH           File system device or disk image for the container\n"
200                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
201                "  -u --user=USER            Run the command under specified user or uid\n"
202                "  -M --machine=NAME         Set the machine name for the container\n"
203                "     --uuid=UUID            Set a specific machine UUID for the container\n"
204                "  -S --slice=SLICE          Place the container in the specified slice\n"
205                "     --private-network      Disable network in container\n"
206                "     --network-interface=INTERFACE\n"
207                "                            Assign an existing network interface to the\n"
208                "                            container\n"
209                "     --network-macvlan=INTERFACE\n"
210                "                            Create a macvlan network interface based on an\n"
211                "                            existing network interface to the container\n"
212                "  -n --network-veth         Add a virtual ethernet connection between host\n"
213                "                            and container\n"
214                "     --network-bridge=INTERFACE\n"
215                "                            Add a virtual ethernet connection between host\n"
216                "                            and container and add it to an existing bridge on\n"
217                "                            the host\n"
218                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
219                "                            Expose a container IP port ont the host\n"
220                "  -Z --selinux-context=SECLABEL\n"
221                "                            Set the SELinux security context to be used by\n"
222                "                            processes in the container\n"
223                "  -L --selinux-apifs-context=SECLABEL\n"
224                "                            Set the SELinux security context to be used by\n"
225                "                            API/tmpfs file systems in the container\n"
226                "     --capability=CAP       In addition to the default, retain specified\n"
227                "                            capability\n"
228                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
229                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
230                "                            try-guest, try-host\n"
231                "  -j                        Equivalent to --link-journal=try-guest\n"
232                "     --read-only            Mount the root directory read-only\n"
233                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
234                "                            the container\n"
235                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
236                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
237                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
238                "     --share-system         Share system namespaces with host\n"
239                "     --register=BOOLEAN     Register container as machine\n"
240                "     --keep-unit            Do not register a scope for the machine, reuse\n"
241                "                            the service unit nspawn is running in\n"
242                "     --volatile[=MODE]      Run the system in volatile mode\n"
243                , program_invocation_short_name);
244 }
245
/* Replaces *b with a cleaned-up copy of path.
 *
 * The path is resolved with canonicalize_file_name(). If that fails because
 * the path does not exist (ENOENT), the path is instead made absolute
 * relative to the current working directory. The result is passed through
 * path_kill_slashes() before being stored.
 *
 * On success the previous value of *b is freed and 0 is returned. On failure
 * *b is left untouched and a negative errno-style value is returned: -errno
 * from the resolution step, or -ENOMEM if the fallback cannot allocate. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is final */
        if (resolved == NULL && errno != ENOENT)
                return -errno;

        if (resolved == NULL) {
                /* Nonexistent path: fall back to a purely lexical absolute path */
                resolved = path_make_absolute_cwd(path);
                if (resolved == NULL)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
266
267 static int parse_argv(int argc, char *argv[]) {
268
269         enum {
270                 ARG_VERSION = 0x100,
271                 ARG_PRIVATE_NETWORK,
272                 ARG_UUID,
273                 ARG_READ_ONLY,
274                 ARG_CAPABILITY,
275                 ARG_DROP_CAPABILITY,
276                 ARG_LINK_JOURNAL,
277                 ARG_BIND,
278                 ARG_BIND_RO,
279                 ARG_TMPFS,
280                 ARG_SETENV,
281                 ARG_SHARE_SYSTEM,
282                 ARG_REGISTER,
283                 ARG_KEEP_UNIT,
284                 ARG_NETWORK_INTERFACE,
285                 ARG_NETWORK_MACVLAN,
286                 ARG_NETWORK_BRIDGE,
287                 ARG_PERSONALITY,
288                 ARG_VOLATILE,
289                 ARG_TEMPLATE,
290         };
291
292         static const struct option options[] = {
293                 { "help",                  no_argument,       NULL, 'h'                   },
294                 { "version",               no_argument,       NULL, ARG_VERSION           },
295                 { "directory",             required_argument, NULL, 'D'                   },
296                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
297                 { "ephemeral",             no_argument,       NULL, 'x'                   },
298                 { "user",                  required_argument, NULL, 'u'                   },
299                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
300                 { "boot",                  no_argument,       NULL, 'b'                   },
301                 { "uuid",                  required_argument, NULL, ARG_UUID              },
302                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
303                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
304                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
305                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
306                 { "bind",                  required_argument, NULL, ARG_BIND              },
307                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
308                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
309                 { "machine",               required_argument, NULL, 'M'                   },
310                 { "slice",                 required_argument, NULL, 'S'                   },
311                 { "setenv",                required_argument, NULL, ARG_SETENV            },
312                 { "selinux-context",       required_argument, NULL, 'Z'                   },
313                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
314                 { "quiet",                 no_argument,       NULL, 'q'                   },
315                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
316                 { "register",              required_argument, NULL, ARG_REGISTER          },
317                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
318                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
319                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
320                 { "network-veth",          no_argument,       NULL, 'n'                   },
321                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
322                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
323                 { "image",                 required_argument, NULL, 'i'                   },
324                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
325                 { "port",                  required_argument, NULL, 'p'                   },
326                 {}
327         };
328
329         int c, r;
330         uint64_t plus = 0, minus = 0;
331
332         assert(argc >= 0);
333         assert(argv);
334
335         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
336
337                 switch (c) {
338
339                 case 'h':
340                         help();
341                         return 0;
342
343                 case ARG_VERSION:
344                         puts(PACKAGE_STRING);
345                         puts(SYSTEMD_FEATURES);
346                         return 0;
347
348                 case 'D':
349                         r = set_sanitized_path(&arg_directory, optarg);
350                         if (r < 0)
351                                 return log_error_errno(r, "Invalid root directory: %m");
352
353                         break;
354
355                 case ARG_TEMPLATE:
356                         r = set_sanitized_path(&arg_template, optarg);
357                         if (r < 0)
358                                 return log_error_errno(r, "Invalid template directory: %m");
359
360                         break;
361
362                 case 'i':
363                         r = set_sanitized_path(&arg_image, optarg);
364                         if (r < 0)
365                                 return log_error_errno(r, "Invalid image path: %m");
366
367                         break;
368
369                 case 'x':
370                         arg_ephemeral = true;
371                         break;
372
373                 case 'u':
374                         free(arg_user);
375                         arg_user = strdup(optarg);
376                         if (!arg_user)
377                                 return log_oom();
378
379                         break;
380
381                 case ARG_NETWORK_BRIDGE:
382                         arg_network_bridge = optarg;
383
384                         /* fall through */
385
386                 case 'n':
387                         arg_network_veth = true;
388                         arg_private_network = true;
389                         break;
390
391                 case ARG_NETWORK_INTERFACE:
392                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
393                                 return log_oom();
394
395                         arg_private_network = true;
396                         break;
397
398                 case ARG_NETWORK_MACVLAN:
399                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
400                                 return log_oom();
401
402                         /* fall through */
403
404                 case ARG_PRIVATE_NETWORK:
405                         arg_private_network = true;
406                         break;
407
408                 case 'b':
409                         arg_boot = true;
410                         break;
411
412                 case ARG_UUID:
413                         r = sd_id128_from_string(optarg, &arg_uuid);
414                         if (r < 0) {
415                                 log_error("Invalid UUID: %s", optarg);
416                                 return r;
417                         }
418                         break;
419
420                 case 'S':
421                         arg_slice = optarg;
422                         break;
423
424                 case 'M':
425                         if (isempty(optarg)) {
426                                 free(arg_machine);
427                                 arg_machine = NULL;
428                         } else {
429                                 if (!machine_name_is_valid(optarg)) {
430                                         log_error("Invalid machine name: %s", optarg);
431                                         return -EINVAL;
432                                 }
433
434                                 r = free_and_strdup(&arg_machine, optarg);
435                                 if (r < 0)
436                                         return log_oom();
437
438                                 break;
439                         }
440
441                 case 'Z':
442                         arg_selinux_context = optarg;
443                         break;
444
445                 case 'L':
446                         arg_selinux_apifs_context = optarg;
447                         break;
448
449                 case ARG_READ_ONLY:
450                         arg_read_only = true;
451                         break;
452
453                 case ARG_CAPABILITY:
454                 case ARG_DROP_CAPABILITY: {
455                         const char *state, *word;
456                         size_t length;
457
458                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
459                                 _cleanup_free_ char *t;
460
461                                 t = strndup(word, length);
462                                 if (!t)
463                                         return log_oom();
464
465                                 if (streq(t, "all")) {
466                                         if (c == ARG_CAPABILITY)
467                                                 plus = (uint64_t) -1;
468                                         else
469                                                 minus = (uint64_t) -1;
470                                 } else {
471                                         int cap;
472
473                                         cap = capability_from_name(t);
474                                         if (cap < 0) {
475                                                 log_error("Failed to parse capability %s.", t);
476                                                 return -EINVAL;
477                                         }
478
479                                         if (c == ARG_CAPABILITY)
480                                                 plus |= 1ULL << (uint64_t) cap;
481                                         else
482                                                 minus |= 1ULL << (uint64_t) cap;
483                                 }
484                         }
485
486                         break;
487                 }
488
489                 case 'j':
490                         arg_link_journal = LINK_GUEST;
491                         arg_link_journal_try = true;
492                         break;
493
494                 case ARG_LINK_JOURNAL:
495                         if (streq(optarg, "auto")) {
496                                 arg_link_journal = LINK_AUTO;
497                                 arg_link_journal_try = false;
498                         } else if (streq(optarg, "no")) {
499                                 arg_link_journal = LINK_NO;
500                                 arg_link_journal_try = false;
501                         } else if (streq(optarg, "guest")) {
502                                 arg_link_journal = LINK_GUEST;
503                                 arg_link_journal_try = false;
504                         } else if (streq(optarg, "host")) {
505                                 arg_link_journal = LINK_HOST;
506                                 arg_link_journal_try = false;
507                         } else if (streq(optarg, "try-guest")) {
508                                 arg_link_journal = LINK_GUEST;
509                                 arg_link_journal_try = true;
510                         } else if (streq(optarg, "try-host")) {
511                                 arg_link_journal = LINK_HOST;
512                                 arg_link_journal_try = true;
513                         } else {
514                                 log_error("Failed to parse link journal mode %s", optarg);
515                                 return -EINVAL;
516                         }
517
518                         break;
519
520                 case ARG_BIND:
521                 case ARG_BIND_RO: {
522                         _cleanup_free_ char *a = NULL, *b = NULL;
523                         char *e;
524                         char ***x;
525
526                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
527
528                         e = strchr(optarg, ':');
529                         if (e) {
530                                 a = strndup(optarg, e - optarg);
531                                 b = strdup(e + 1);
532                         } else {
533                                 a = strdup(optarg);
534                                 b = strdup(optarg);
535                         }
536
537                         if (!a || !b)
538                                 return log_oom();
539
540                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
541                                 log_error("Invalid bind mount specification: %s", optarg);
542                                 return -EINVAL;
543                         }
544
545                         r = strv_extend(x, a);
546                         if (r < 0)
547                                 return log_oom();
548
549                         r = strv_extend(x, b);
550                         if (r < 0)
551                                 return log_oom();
552
553                         break;
554                 }
555
556                 case ARG_TMPFS: {
557                         _cleanup_free_ char *a = NULL, *b = NULL;
558                         char *e;
559
560                         e = strchr(optarg, ':');
561                         if (e) {
562                                 a = strndup(optarg, e - optarg);
563                                 b = strdup(e + 1);
564                         } else {
565                                 a = strdup(optarg);
566                                 b = strdup("mode=0755");
567                         }
568
569                         if (!a || !b)
570                                 return log_oom();
571
572                         if (!path_is_absolute(a)) {
573                                 log_error("Invalid tmpfs specification: %s", optarg);
574                                 return -EINVAL;
575                         }
576
577                         r = strv_push(&arg_tmpfs, a);
578                         if (r < 0)
579                                 return log_oom();
580
581                         a = NULL;
582
583                         r = strv_push(&arg_tmpfs, b);
584                         if (r < 0)
585                                 return log_oom();
586
587                         b = NULL;
588
589                         break;
590                 }
591
592                 case ARG_SETENV: {
593                         char **n;
594
595                         if (!env_assignment_is_valid(optarg)) {
596                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
597                                 return -EINVAL;
598                         }
599
600                         n = strv_env_set(arg_setenv, optarg);
601                         if (!n)
602                                 return log_oom();
603
604                         strv_free(arg_setenv);
605                         arg_setenv = n;
606                         break;
607                 }
608
609                 case 'q':
610                         arg_quiet = true;
611                         break;
612
613                 case ARG_SHARE_SYSTEM:
614                         arg_share_system = true;
615                         break;
616
617                 case ARG_REGISTER:
618                         r = parse_boolean(optarg);
619                         if (r < 0) {
620                                 log_error("Failed to parse --register= argument: %s", optarg);
621                                 return r;
622                         }
623
624                         arg_register = r;
625                         break;
626
627                 case ARG_KEEP_UNIT:
628                         arg_keep_unit = true;
629                         break;
630
631                 case ARG_PERSONALITY:
632
633                         arg_personality = personality_from_string(optarg);
634                         if (arg_personality == 0xffffffffLU) {
635                                 log_error("Unknown or unsupported personality '%s'.", optarg);
636                                 return -EINVAL;
637                         }
638
639                         break;
640
641                 case ARG_VOLATILE:
642
643                         if (!optarg)
644                                 arg_volatile = VOLATILE_YES;
645                         else {
646                                 r = parse_boolean(optarg);
647                                 if (r < 0) {
648                                         if (streq(optarg, "state"))
649                                                 arg_volatile = VOLATILE_STATE;
650                                         else {
651                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
652                                                 return r;
653                                         }
654                                 } else
655                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
656                         }
657
658                         break;
659
660                 case 'p': {
661                         const char *split, *e;
662                         uint16_t container_port, host_port;
663                         int protocol;
664                         ExposePort *p;
665
666                         if ((e = startswith(optarg, "tcp:")))
667                                 protocol = IPPROTO_TCP;
668                         else if ((e = startswith(optarg, "udp:")))
669                                 protocol = IPPROTO_UDP;
670                         else {
671                                 e = optarg;
672                                 protocol = IPPROTO_TCP;
673                         }
674
675                         split = strchr(e, ':');
676                         if (split) {
677                                 char v[split - e + 1];
678
679                                 memcpy(v, e, split - e);
680                                 v[split - e] = 0;
681
682                                 r = safe_atou16(v, &host_port);
683                                 if (r < 0 || host_port <= 0) {
684                                         log_error("Failed to parse host port: %s", optarg);
685                                         return -EINVAL;
686                                 }
687
688                                 r = safe_atou16(split + 1, &container_port);
689                         } else {
690                                 r = safe_atou16(e, &container_port);
691                                 host_port = container_port;
692                         }
693
694                         if (r < 0 || container_port <= 0) {
695                                 log_error("Failed to parse host port: %s", optarg);
696                                 return -EINVAL;
697                         }
698
699                         LIST_FOREACH(ports, p, arg_expose_ports) {
700                                 if (p->protocol == protocol && p->host_port == host_port) {
701                                         log_error("Duplicate port specification: %s", optarg);
702                                         return -EINVAL;
703                                 }
704                         }
705
706                         p = new(ExposePort, 1);
707                         if (!p)
708                                 return log_oom();
709
710                         p->protocol = protocol;
711                         p->host_port = host_port;
712                         p->container_port = container_port;
713
714                         LIST_PREPEND(ports, arg_expose_ports, p);
715
716                         break;
717                 }
718
719                 case '?':
720                         return -EINVAL;
721
722                 default:
723                         assert_not_reached("Unhandled option");
724                 }
725
726         if (arg_share_system)
727                 arg_register = false;
728
729         if (arg_boot && arg_share_system) {
730                 log_error("--boot and --share-system may not be combined.");
731                 return -EINVAL;
732         }
733
734         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
735                 log_error("--keep-unit may not be used when invoked from a user session.");
736                 return -EINVAL;
737         }
738
739         if (arg_directory && arg_image) {
740                 log_error("--directory= and --image= may not be combined.");
741                 return -EINVAL;
742         }
743
744         if (arg_template && arg_image) {
745                 log_error("--template= and --image= may not be combined.");
746                 return -EINVAL;
747         }
748
749         if (arg_template && !(arg_directory || arg_machine)) {
750                 log_error("--template= needs --directory= or --machine=.");
751                 return -EINVAL;
752         }
753
754         if (arg_ephemeral && arg_template) {
755                 log_error("--ephemeral and --template= may not be combined.");
756                 return -EINVAL;
757         }
758
759         if (arg_ephemeral && arg_image) {
760                 log_error("--ephemeral and --image= may not be combined.");
761                 return -EINVAL;
762         }
763
764         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
765                 log_error("--ephemeral and --link-journal= may not be combined.");
766                 return -EINVAL;
767         }
768
769         if (arg_volatile != VOLATILE_NO && arg_read_only) {
770                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
771                 return -EINVAL;
772         }
773
774         if (arg_expose_ports && !arg_private_network) {
775                 log_error("Cannot use --port= without private networking.");
776                 return -EINVAL;
777         }
778
779         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
780
781         return 1;
782 }
783
784 static int mount_all(const char *dest) {
785
786         typedef struct MountPoint {
787                 const char *what;
788                 const char *where;
789                 const char *type;
790                 const char *options;
791                 unsigned long flags;
792                 bool fatal;
793         } MountPoint;
794
795         static const MountPoint mount_table[] = {
796                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
797                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
798                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
799                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
800                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
801                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
802                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
803                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
804 #ifdef HAVE_SELINUX
805                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
806                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
807 #endif
808         };
809
810         unsigned k;
811         int r = 0;
812
813         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
814                 _cleanup_free_ char *where = NULL;
815 #ifdef HAVE_SELINUX
816                 _cleanup_free_ char *options = NULL;
817 #endif
818                 const char *o;
819                 int t;
820
821                 where = strjoin(dest, "/", mount_table[k].where, NULL);
822                 if (!where)
823                         return log_oom();
824
825                 t = path_is_mount_point(where, true);
826                 if (t < 0) {
827                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
828
829                         if (r == 0)
830                                 r = t;
831
832                         continue;
833                 }
834
835                 /* Skip this entry if it is not a remount. */
836                 if (mount_table[k].what && t > 0)
837                         continue;
838
839                 t = mkdir_p(where, 0755);
840                 if (t < 0) {
841                         if (mount_table[k].fatal) {
842                                log_error_errno(t, "Failed to create directory %s: %m", where);
843
844                                 if (r == 0)
845                                         r = t;
846                         } else
847                                log_warning_errno(t, "Failed to create directory %s: %m", where);
848
849                         continue;
850                 }
851
852 #ifdef HAVE_SELINUX
853                 if (arg_selinux_apifs_context &&
854                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
855                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
856                         if (!options)
857                                 return log_oom();
858
859                         o = options;
860                 } else
861 #endif
862                         o = mount_table[k].options;
863
864
865                 if (mount(mount_table[k].what,
866                           where,
867                           mount_table[k].type,
868                           mount_table[k].flags,
869                           o) < 0) {
870
871                         if (mount_table[k].fatal) {
872                                 log_error_errno(errno, "mount(%s) failed: %m", where);
873
874                                 if (r == 0)
875                                         r = -errno;
876                         } else
877                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
878                 }
879         }
880
881         return r;
882 }
883
884 static int mount_binds(const char *dest, char **l, bool ro) {
885         char **x, **y;
886
887         STRV_FOREACH_PAIR(x, y, l) {
888                 _cleanup_free_ char *where = NULL;
889                 struct stat source_st, dest_st;
890                 int r;
891
892                 if (stat(*x, &source_st) < 0)
893                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
894
895                 where = strappend(dest, *y);
896                 if (!where)
897                         return log_oom();
898
899                 r = stat(where, &dest_st);
900                 if (r == 0) {
901                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
902                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
903                                 return -EINVAL;
904                         }
905                 } else if (errno == ENOENT) {
906                         r = mkdir_parents_label(where, 0755);
907                         if (r < 0)
908                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
909                 } else {
910                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
911                         return -errno;
912                 }
913
914                 /* Create the mount point, but be conservative -- refuse to create block
915                  * and char devices. */
916                 if (S_ISDIR(source_st.st_mode)) {
917                         r = mkdir_label(where, 0755);
918                         if (r < 0 && errno != EEXIST)
919                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
920                 } else if (S_ISFIFO(source_st.st_mode)) {
921                         r = mkfifo(where, 0644);
922                         if (r < 0 && errno != EEXIST)
923                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
924                 } else if (S_ISSOCK(source_st.st_mode)) {
925                         r = mknod(where, 0644 | S_IFSOCK, 0);
926                         if (r < 0 && errno != EEXIST)
927                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
928                 } else if (S_ISREG(source_st.st_mode)) {
929                         r = touch(where);
930                         if (r < 0)
931                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
932                 } else {
933                         log_error("Refusing to create mountpoint for file: %s", *x);
934                         return -ENOTSUP;
935                 }
936
937                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
938                         return log_error_errno(errno, "mount(%s) failed: %m", where);
939
940                 if (ro) {
941                         r = bind_remount_recursive(where, true);
942                         if (r < 0)
943                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
944                 }
945         }
946
947         return 0;
948 }
949
/* Mounts the cgroup hierarchy "hierarchy" with the mount option string
 * "controller" to <dest>/sys/fs/cgroup/<hierarchy>, optionally
 * read-only.
 *
 * Returns 1 if a new mount was established, 0 if the target already is
 * a mount point, and a negative error on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        unsigned long flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *target;
        int mounted;

        target = strappenda(dest, "/sys/fs/cgroup/", hierarchy);

        mounted = path_is_mount_point(target, false);
        if (mounted < 0)
                return log_error_errno(mounted, "Failed to determine if %s is mounted already: %m", target);
        if (mounted > 0)
                return 0;

        /* The result is not checked here; a missing directory shows up
         * as a failure of the mount() call right below. */
        mkdir_p(target, 0755);

        if (read_only)
                flags |= MS_RDONLY;

        if (mount("cgroup", target, "cgroup", flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", target);

        return 1;
}
969
970 static int mount_cgroup(const char *dest) {
971         _cleanup_set_free_free_ Set *controllers = NULL;
972         _cleanup_free_ char *own_cgroup_path = NULL;
973         const char *cgroup_root, *systemd_root, *systemd_own;
974         int r;
975
976         controllers = set_new(&string_hash_ops);
977         if (!controllers)
978                 return log_oom();
979
980         r = cg_kernel_controllers(controllers);
981         if (r < 0)
982                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
983
984         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
985         if (r < 0)
986                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
987
988         cgroup_root = strappenda(dest, "/sys/fs/cgroup");
989         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
990                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
991
992         for (;;) {
993                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
994
995                 controller = set_steal_first(controllers);
996                 if (!controller)
997                         break;
998
999                 origin = strappend("/sys/fs/cgroup/", controller);
1000                 if (!origin)
1001                         return log_oom();
1002
1003                 r = readlink_malloc(origin, &combined);
1004                 if (r == -EINVAL) {
1005                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1006
1007                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1008                         if (r < 0)
1009                                 return r;
1010
1011                 } else if (r < 0)
1012                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1013                 else {
1014                         _cleanup_free_ char *target = NULL;
1015
1016                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1017                         if (!target)
1018                                 return log_oom();
1019
1020                         /* A symbolic link, a combination of controllers in one hierarchy */
1021
1022                         if (!filename_is_valid(combined)) {
1023                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1024                                 continue;
1025                         }
1026
1027                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1028                         if (r < 0)
1029                                 return r;
1030
1031                         if (symlink(combined, target) < 0)
1032                                 return log_error_errno(errno, "Failed to create symlink for combined hiearchy: %m");
1033                 }
1034         }
1035
1036         r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
1037         if (r < 0)
1038                 return r;
1039
1040         /* Make our own cgroup a (writable) bind mount */
1041         systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1042         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1043                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1044
1045         /* And then remount the systemd cgroup root read-only */
1046         systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
1047         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1048                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1049
1050         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1051                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1052
1053         return 0;
1054 }
1055
1056 static int mount_tmpfs(const char *dest) {
1057         char **i, **o;
1058
1059         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1060                 _cleanup_free_ char *where = NULL;
1061                 int r;
1062
1063                 where = strappend(dest, *i);
1064                 if (!where)
1065                         return log_oom();
1066
1067                 r = mkdir_label(where, 0755);
1068                 if (r < 0 && r != -EEXIST)
1069                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1070
1071                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1072                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1073         }
1074
1075         return 0;
1076 }
1077
1078 static int setup_timezone(const char *dest) {
1079         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1080         char *z, *y;
1081         int r;
1082
1083         assert(dest);
1084
1085         /* Fix the timezone, if possible */
1086         r = readlink_malloc("/etc/localtime", &p);
1087         if (r < 0) {
1088                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1089                 return 0;
1090         }
1091
1092         z = path_startswith(p, "../usr/share/zoneinfo/");
1093         if (!z)
1094                 z = path_startswith(p, "/usr/share/zoneinfo/");
1095         if (!z) {
1096                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1097                 return 0;
1098         }
1099
1100         where = strappend(dest, "/etc/localtime");
1101         if (!where)
1102                 return log_oom();
1103
1104         r = readlink_malloc(where, &q);
1105         if (r >= 0) {
1106                 y = path_startswith(q, "../usr/share/zoneinfo/");
1107                 if (!y)
1108                         y = path_startswith(q, "/usr/share/zoneinfo/");
1109
1110                 /* Already pointing to the right place? Then do nothing .. */
1111                 if (y && streq(y, z))
1112                         return 0;
1113         }
1114
1115         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1116         if (!check)
1117                 return log_oom();
1118
1119         if (access(check, F_OK) < 0) {
1120                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1121                 return 0;
1122         }
1123
1124         what = strappend("../usr/share/zoneinfo/", z);
1125         if (!what)
1126                 return log_oom();
1127
1128         r = mkdir_parents(where, 0755);
1129         if (r < 0) {
1130                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1131
1132                 return 0;
1133         }
1134
1135         r = unlink(where);
1136         if (r < 0 && errno != ENOENT) {
1137                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1138
1139                 return 0;
1140         }
1141
1142         if (symlink(what, where) < 0) {
1143                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1144                 return 0;
1145         }
1146
1147         return 0;
1148 }
1149
1150 static int setup_resolv_conf(const char *dest) {
1151         _cleanup_free_ char *where = NULL;
1152         int r;
1153
1154         assert(dest);
1155
1156         if (arg_private_network)
1157                 return 0;
1158
1159         /* Fix resolv.conf, if possible */
1160         where = strappend(dest, "/etc/resolv.conf");
1161         if (!where)
1162                 return log_oom();
1163
1164         /* We don't really care for the results of this really. If it
1165          * fails, it fails, but meh... */
1166         r = mkdir_parents(where, 0755);
1167         if (r < 0) {
1168                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1169
1170                 return 0;
1171         }
1172
1173         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1174         if (r < 0) {
1175                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1176
1177                 return 0;
1178         }
1179
1180         return 0;
1181 }
1182
1183 static int setup_volatile_state(const char *directory) {
1184         const char *p;
1185         int r;
1186
1187         assert(directory);
1188
1189         if (arg_volatile != VOLATILE_STATE)
1190                 return 0;
1191
1192         /* --volatile=state means we simply overmount /var
1193            with a tmpfs, and the rest read-only. */
1194
1195         r = bind_remount_recursive(directory, true);
1196         if (r < 0)
1197                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1198
1199         p = strappenda(directory, "/var");
1200         r = mkdir(p, 0755);
1201         if (r < 0 && errno != EEXIST)
1202                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1203
1204         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1205                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1206
1207         return 0;
1208 }
1209
1210 static int setup_volatile(const char *directory) {
1211         bool tmpfs_mounted = false, bind_mounted = false;
1212         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1213         const char *f, *t;
1214         int r;
1215
1216         assert(directory);
1217
1218         if (arg_volatile != VOLATILE_YES)
1219                 return 0;
1220
1221         /* --volatile=yes means we mount a tmpfs to the root dir, and
1222            the original /usr to use inside it, and that read-only. */
1223
1224         if (!mkdtemp(template))
1225                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1226
1227         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1228                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1229                 r = -errno;
1230                 goto fail;
1231         }
1232
1233         tmpfs_mounted = true;
1234
1235         f = strappenda(directory, "/usr");
1236         t = strappenda(template, "/usr");
1237
1238         r = mkdir(t, 0755);
1239         if (r < 0 && errno != EEXIST) {
1240                 log_error_errno(errno, "Failed to create %s: %m", t);
1241                 r = -errno;
1242                 goto fail;
1243         }
1244
1245         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1246                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1247                 r = -errno;
1248                 goto fail;
1249         }
1250
1251         bind_mounted = true;
1252
1253         r = bind_remount_recursive(t, true);
1254         if (r < 0) {
1255                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1256                 goto fail;
1257         }
1258
1259         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1260                 log_error_errno(errno, "Failed to move root mount: %m");
1261                 r = -errno;
1262                 goto fail;
1263         }
1264
1265         rmdir(template);
1266
1267         return 0;
1268
1269 fail:
1270         if (bind_mounted)
1271                 umount(t);
1272         if (tmpfs_mounted)
1273                 umount(template);
1274         rmdir(template);
1275         return r;
1276 }
1277
1278 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1279
1280         snprintf(s, 37,
1281                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1282                  SD_ID128_FORMAT_VAL(id));
1283
1284         return s;
1285 }
1286
1287 static int setup_boot_id(const char *dest) {
1288         _cleanup_free_ char *from = NULL, *to = NULL;
1289         sd_id128_t rnd = {};
1290         char as_uuid[37];
1291         int r;
1292
1293         assert(dest);
1294
1295         if (arg_share_system)
1296                 return 0;
1297
1298         /* Generate a new randomized boot ID, so that each boot-up of
1299          * the container gets a new one */
1300
1301         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1302         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1303         if (!from || !to)
1304                 return log_oom();
1305
1306         r = sd_id128_randomize(&rnd);
1307         if (r < 0)
1308                 return log_error_errno(r, "Failed to generate random boot id: %m");
1309
1310         id128_format_as_uuid(rnd, as_uuid);
1311
1312         r = write_string_file(from, as_uuid);
1313         if (r < 0)
1314                 return log_error_errno(r, "Failed to write boot id: %m");
1315
1316         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1317                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1318                 r = -errno;
1319         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1320                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1321
1322         unlink(from);
1323         return r;
1324 }
1325
1326 static int copy_devnodes(const char *dest) {
1327
1328         static const char devnodes[] =
1329                 "null\0"
1330                 "zero\0"
1331                 "full\0"
1332                 "random\0"
1333                 "urandom\0"
1334                 "tty\0"
1335                 "net/tun\0";
1336
1337         const char *d;
1338         int r = 0;
1339         _cleanup_umask_ mode_t u;
1340
1341         assert(dest);
1342
1343         u = umask(0000);
1344
1345         NULSTR_FOREACH(d, devnodes) {
1346                 _cleanup_free_ char *from = NULL, *to = NULL;
1347                 struct stat st;
1348
1349                 from = strappend("/dev/", d);
1350                 to = strjoin(dest, "/dev/", d, NULL);
1351                 if (!from || !to)
1352                         return log_oom();
1353
1354                 if (stat(from, &st) < 0) {
1355
1356                         if (errno != ENOENT)
1357                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1358
1359                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1360
1361                         log_error("%s is not a char or block device, cannot copy", from);
1362                         return -EIO;
1363
1364                 } else {
1365                         r = mkdir_parents(to, 0775);
1366                         if (r < 0) {
1367                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1368                                 return -r;
1369                         }
1370
1371                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1372                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1373                 }
1374         }
1375
1376         return r;
1377 }
1378
1379 static int setup_ptmx(const char *dest) {
1380         _cleanup_free_ char *p = NULL;
1381
1382         p = strappend(dest, "/dev/ptmx");
1383         if (!p)
1384                 return log_oom();
1385
1386         if (symlink("pts/ptmx", p) < 0)
1387                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1388
1389         return 0;
1390 }
1391
1392 static int setup_dev_console(const char *dest, const char *console) {
1393         _cleanup_umask_ mode_t u;
1394         const char *to;
1395         struct stat st;
1396         int r;
1397
1398         assert(dest);
1399         assert(console);
1400
1401         u = umask(0000);
1402
1403         if (stat("/dev/null", &st) < 0)
1404                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1405
1406         r = chmod_and_chown(console, 0600, 0, 0);
1407         if (r < 0)
1408                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1409
1410         /* We need to bind mount the right tty to /dev/console since
1411          * ptys can only exist on pts file systems. To have something
1412          * to bind mount things on we create a device node first, and
1413          * use /dev/null for that since we the cgroups device policy
1414          * allows us to create that freely, while we cannot create
1415          * /dev/console. (Note that the major minor doesn't actually
1416          * matter here, since we mount it over anyway). */
1417
1418         to = strappenda(dest, "/dev/console");
1419         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1420                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1421
1422         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1423                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1424
1425         return 0;
1426 }
1427
1428 static int setup_kmsg(const char *dest, int kmsg_socket) {
1429         _cleanup_free_ char *from = NULL, *to = NULL;
1430         _cleanup_umask_ mode_t u;
1431         int r, fd, k;
1432         union {
1433                 struct cmsghdr cmsghdr;
1434                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1435         } control = {};
1436         struct msghdr mh = {
1437                 .msg_control = &control,
1438                 .msg_controllen = sizeof(control),
1439         };
1440         struct cmsghdr *cmsg;
1441
1442         assert(dest);
1443         assert(kmsg_socket >= 0);
1444
1445         u = umask(0000);
1446
1447         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1448          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1449          * on the reading side behave very similar to /proc/kmsg,
1450          * their writing side behaves differently from /dev/kmsg in
1451          * that writing blocks when nothing is reading. In order to
1452          * avoid any problems with containers deadlocking due to this
1453          * we simply make /dev/kmsg unavailable to the container. */
1454         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1455             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1456                 return log_oom();
1457
1458         if (mkfifo(from, 0600) < 0)
1459                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1460
1461         r = chmod_and_chown(from, 0600, 0, 0);
1462         if (r < 0)
1463                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1464
1465         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1466                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1467
1468         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1469         if (fd < 0)
1470                 return log_error_errno(errno, "Failed to open fifo: %m");
1471
1472         cmsg = CMSG_FIRSTHDR(&mh);
1473         cmsg->cmsg_level = SOL_SOCKET;
1474         cmsg->cmsg_type = SCM_RIGHTS;
1475         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1476         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1477
1478         mh.msg_controllen = cmsg->cmsg_len;
1479
1480         /* Store away the fd in the socket, so that it stays open as
1481          * long as we run the child */
1482         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1483         safe_close(fd);
1484
1485         if (k < 0)
1486                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1487
1488         /* And now make the FIFO unavailable as /dev/kmsg... */
1489         unlink(from);
1490         return 0;
1491 }
1492
1493 static int send_rtnl(int send_fd) {
1494         union {
1495                 struct cmsghdr cmsghdr;
1496                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1497         } control = {};
1498         struct msghdr mh = {
1499                 .msg_control = &control,
1500                 .msg_controllen = sizeof(control),
1501         };
1502         struct cmsghdr *cmsg;
1503         _cleanup_close_ int fd = -1;
1504         ssize_t k;
1505
1506         assert(send_fd >= 0);
1507
1508         if (!arg_expose_ports)
1509                 return 0;
1510
1511         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1512         if (fd < 0)
1513                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1514
1515         cmsg = CMSG_FIRSTHDR(&mh);
1516         cmsg->cmsg_level = SOL_SOCKET;
1517         cmsg->cmsg_type = SCM_RIGHTS;
1518         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1519         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1520
1521         mh.msg_controllen = cmsg->cmsg_len;
1522
1523         /* Store away the fd in the socket, so that it stays open as
1524          * long as we run the child */
1525         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1526         if (k < 0)
1527                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1528
1529         return 0;
1530 }
1531
1532 static int flush_ports(union in_addr_union *exposed) {
1533         ExposePort *p;
1534         int r, af = AF_INET;
1535
1536         assert(exposed);
1537
1538         if (!arg_expose_ports)
1539                 return 0;
1540
1541         if (in_addr_is_null(af, exposed))
1542                 return 0;
1543
1544         log_debug("Lost IP address.");
1545
1546         LIST_FOREACH(ports, p, arg_expose_ports) {
1547                 r = fw_add_local_dnat(false,
1548                                       af,
1549                                       p->protocol,
1550                                       NULL,
1551                                       NULL, 0,
1552                                       NULL, 0,
1553                                       p->host_port,
1554                                       exposed,
1555                                       p->container_port,
1556                                       NULL);
1557                 if (r < 0)
1558                         log_warning_errno(r, "Failed to modify firewall: %m");
1559         }
1560
1561         *exposed = IN_ADDR_NULL;
1562         return 0;
1563 }
1564
1565 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1566         _cleanup_free_ struct local_address *addresses = NULL;
1567         _cleanup_free_ char *pretty = NULL;
1568         union in_addr_union new_exposed;
1569         ExposePort *p;
1570         bool add;
1571         int af = AF_INET, r;
1572
1573         assert(exposed);
1574
1575         /* Invoked each time an address is added or removed inside the
1576          * container */
1577
1578         if (!arg_expose_ports)
1579                 return 0;
1580
1581         r = local_addresses(rtnl, 0, af, &addresses);
1582         if (r < 0)
1583                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1584
1585         add = r > 0 &&
1586                 addresses[0].family == af &&
1587                 addresses[0].scope < RT_SCOPE_LINK;
1588
1589         if (!add)
1590                 return flush_ports(exposed);
1591
1592         new_exposed = addresses[0].address;
1593         if (in_addr_equal(af, exposed, &new_exposed))
1594                 return 0;
1595
1596         in_addr_to_string(af, &new_exposed, &pretty);
1597         log_debug("New container IP is %s.", strna(pretty));
1598
1599         LIST_FOREACH(ports, p, arg_expose_ports) {
1600
1601                 r = fw_add_local_dnat(true,
1602                                       af,
1603                                       p->protocol,
1604                                       NULL,
1605                                       NULL, 0,
1606                                       NULL, 0,
1607                                       p->host_port,
1608                                       &new_exposed,
1609                                       p->container_port,
1610                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1611                 if (r < 0)
1612                         log_warning_errno(r, "Failed to modify firewall: %m");
1613         }
1614
1615         *exposed = new_exposed;
1616         return 0;
1617 }
1618
1619 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1620         union in_addr_union *exposed = userdata;
1621
1622         assert(rtnl);
1623         assert(m);
1624         assert(exposed);
1625
1626         expose_ports(rtnl, exposed);
1627         return 0;
1628 }
1629
1630 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1631         union {
1632                 struct cmsghdr cmsghdr;
1633                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1634         } control = {};
1635         struct msghdr mh = {
1636                 .msg_control = &control,
1637                 .msg_controllen = sizeof(control),
1638         };
1639         struct cmsghdr *cmsg;
1640         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1641         int fd, r;
1642         ssize_t k;
1643
1644         assert(event);
1645         assert(recv_fd >= 0);
1646         assert(ret);
1647
1648         if (!arg_expose_ports)
1649                 return 0;
1650
1651         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1652         if (k < 0)
1653                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1654
1655         cmsg = CMSG_FIRSTHDR(&mh);
1656         assert(cmsg->cmsg_level == SOL_SOCKET);
1657         assert(cmsg->cmsg_type == SCM_RIGHTS);
1658         assert(cmsg->cmsg_len = CMSG_LEN(sizeof(int)));
1659         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1660
1661         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1662         if (r < 0) {
1663                 safe_close(fd);
1664                 return log_error_errno(r, "Failed to create rtnl object: %m");
1665         }
1666
1667         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1668         if (r < 0)
1669                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1670
1671         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1672         if (r < 0)
1673                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1674
1675         r = sd_rtnl_attach_event(rtnl, event, 0);
1676         if (r < 0)
1677                 return log_error_errno(r, "Failed to add to even loop: %m");
1678
1679         *ret = rtnl;
1680         rtnl = NULL;
1681
1682         return 0;
1683 }
1684
1685 static int setup_hostname(void) {
1686
1687         if (arg_share_system)
1688                 return 0;
1689
1690         if (sethostname_idempotent(arg_machine) < 0)
1691                 return -errno;
1692
1693         return 0;
1694 }
1695
1696 static int setup_journal(const char *directory) {
1697         sd_id128_t machine_id, this_id;
1698         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1699         char *id;
1700         int r;
1701
1702         /* Don't link journals in ephemeral mode */
1703         if (arg_ephemeral)
1704                 return 0;
1705
1706         p = strappend(directory, "/etc/machine-id");
1707         if (!p)
1708                 return log_oom();
1709
1710         r = read_one_line_file(p, &b);
1711         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1712                 return 0;
1713         else if (r < 0)
1714                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1715
1716         id = strstrip(b);
1717         if (isempty(id) && arg_link_journal == LINK_AUTO)
1718                 return 0;
1719
1720         /* Verify validity */
1721         r = sd_id128_from_string(id, &machine_id);
1722         if (r < 0)
1723                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1724
1725         r = sd_id128_get_machine(&this_id);
1726         if (r < 0)
1727                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1728
1729         if (sd_id128_equal(machine_id, this_id)) {
1730                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1731                          "Host and machine ids are equal (%s): refusing to link journals", id);
1732                 if (arg_link_journal == LINK_AUTO)
1733                         return 0;
1734                 return -EEXIST;
1735         }
1736
1737         if (arg_link_journal == LINK_NO)
1738                 return 0;
1739
1740         free(p);
1741         p = strappend("/var/log/journal/", id);
1742         q = strjoin(directory, "/var/log/journal/", id, NULL);
1743         if (!p || !q)
1744                 return log_oom();
1745
1746         if (path_is_mount_point(p, false) > 0) {
1747                 if (arg_link_journal != LINK_AUTO) {
1748                         log_error("%s: already a mount point, refusing to use for journal", p);
1749                         return -EEXIST;
1750                 }
1751
1752                 return 0;
1753         }
1754
1755         if (path_is_mount_point(q, false) > 0) {
1756                 if (arg_link_journal != LINK_AUTO) {
1757                         log_error("%s: already a mount point, refusing to use for journal", q);
1758                         return -EEXIST;
1759                 }
1760
1761                 return 0;
1762         }
1763
1764         r = readlink_and_make_absolute(p, &d);
1765         if (r >= 0) {
1766                 if ((arg_link_journal == LINK_GUEST ||
1767                      arg_link_journal == LINK_AUTO) &&
1768                     path_equal(d, q)) {
1769
1770                         r = mkdir_p(q, 0755);
1771                         if (r < 0)
1772                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1773                         return 0;
1774                 }
1775
1776                 if (unlink(p) < 0)
1777                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1778         } else if (r == -EINVAL) {
1779
1780                 if (arg_link_journal == LINK_GUEST &&
1781                     rmdir(p) < 0) {
1782
1783                         if (errno == ENOTDIR) {
1784                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1785                                 return r;
1786                         } else {
1787                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1788                                 return -errno;
1789                         }
1790                 }
1791         } else if (r != -ENOENT) {
1792                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1793                 return r;
1794         }
1795
1796         if (arg_link_journal == LINK_GUEST) {
1797
1798                 if (symlink(q, p) < 0) {
1799                         if (arg_link_journal_try) {
1800                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1801                                 return 0;
1802                         } else {
1803                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1804                                 return -errno;
1805                         }
1806                 }
1807
1808                 r = mkdir_p(q, 0755);
1809                 if (r < 0)
1810                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1811                 return 0;
1812         }
1813
1814         if (arg_link_journal == LINK_HOST) {
1815                 /* don't create parents here -- if the host doesn't have
1816                  * permanent journal set up, don't force it here */
1817                 r = mkdir(p, 0755);
1818                 if (r < 0) {
1819                         if (arg_link_journal_try) {
1820                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1821                                 return 0;
1822                         } else {
1823                                 log_error_errno(errno, "Failed to create %s: %m", p);
1824                                 return r;
1825                         }
1826                 }
1827
1828         } else if (access(p, F_OK) < 0)
1829                 return 0;
1830
1831         if (dir_is_empty(q) == 0)
1832                 log_warning("%s is not empty, proceeding anyway.", q);
1833
1834         r = mkdir_p(q, 0755);
1835         if (r < 0) {
1836                 log_error_errno(errno, "Failed to create %s: %m", q);
1837                 return r;
1838         }
1839
1840         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1841                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1842
1843         return 0;
1844 }
1845
1846 static int drop_capabilities(void) {
1847         return capability_bounding_set_drop(~arg_retain, false);
1848 }
1849
1850 static int register_machine(pid_t pid, int local_ifindex) {
1851         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1852         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1853         int r;
1854
1855         if (!arg_register)
1856                 return 0;
1857
1858         r = sd_bus_default_system(&bus);
1859         if (r < 0)
1860                 return log_error_errno(r, "Failed to open system bus: %m");
1861
1862         if (arg_keep_unit) {
1863                 r = sd_bus_call_method(
1864                                 bus,
1865                                 "org.freedesktop.machine1",
1866                                 "/org/freedesktop/machine1",
1867                                 "org.freedesktop.machine1.Manager",
1868                                 "RegisterMachineWithNetwork",
1869                                 &error,
1870                                 NULL,
1871                                 "sayssusai",
1872                                 arg_machine,
1873                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1874                                 "nspawn",
1875                                 "container",
1876                                 (uint32_t) pid,
1877                                 strempty(arg_directory),
1878                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1879         } else {
1880                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1881
1882                 r = sd_bus_message_new_method_call(
1883                                 bus,
1884                                 &m,
1885                                 "org.freedesktop.machine1",
1886                                 "/org/freedesktop/machine1",
1887                                 "org.freedesktop.machine1.Manager",
1888                                 "CreateMachineWithNetwork");
1889                 if (r < 0)
1890                         return log_error_errno(r, "Failed to create message: %m");
1891
1892                 r = sd_bus_message_append(
1893                                 m,
1894                                 "sayssusai",
1895                                 arg_machine,
1896                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1897                                 "nspawn",
1898                                 "container",
1899                                 (uint32_t) pid,
1900                                 strempty(arg_directory),
1901                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1902                 if (r < 0)
1903                         return log_error_errno(r, "Failed to append message arguments: %m");
1904
1905                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1906                 if (r < 0)
1907                         return log_error_errno(r, "Failed to open container: %m");
1908
1909                 if (!isempty(arg_slice)) {
1910                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1911                         if (r < 0)
1912                                 return log_error_errno(r, "Failed to append slice: %m");
1913                 }
1914
1915                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1916                 if (r < 0)
1917                         return log_error_errno(r, "Failed to add device policy: %m");
1918
1919                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1920                                           /* Allow the container to
1921                                            * access and create the API
1922                                            * device nodes, so that
1923                                            * PrivateDevices= in the
1924                                            * container can work
1925                                            * fine */
1926                                           "/dev/null", "rwm",
1927                                           "/dev/zero", "rwm",
1928                                           "/dev/full", "rwm",
1929                                           "/dev/random", "rwm",
1930                                           "/dev/urandom", "rwm",
1931                                           "/dev/tty", "rwm",
1932                                           "/dev/net/tun", "rwm",
1933                                           /* Allow the container
1934                                            * access to ptys. However,
1935                                            * do not permit the
1936                                            * container to ever create
1937                                            * these device nodes. */
1938                                           "/dev/pts/ptmx", "rw",
1939                                           "char-pts", "rw");
1940                 if (r < 0)
1941                         return log_error_errno(r, "Failed to add device whitelist: %m");
1942
1943                 r = sd_bus_message_close_container(m);
1944                 if (r < 0)
1945                         return log_error_errno(r, "Failed to close container: %m");
1946
1947                 r = sd_bus_call(bus, m, 0, &error, NULL);
1948         }
1949
1950         if (r < 0) {
1951                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1952                 return r;
1953         }
1954
1955         return 0;
1956 }
1957
1958 static int terminate_machine(pid_t pid) {
1959         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1960         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1961         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1962         const char *path;
1963         int r;
1964
1965         if (!arg_register)
1966                 return 0;
1967
1968         r = sd_bus_default_system(&bus);
1969         if (r < 0)
1970                 return log_error_errno(r, "Failed to open system bus: %m");
1971
1972         r = sd_bus_call_method(
1973                         bus,
1974                         "org.freedesktop.machine1",
1975                         "/org/freedesktop/machine1",
1976                         "org.freedesktop.machine1.Manager",
1977                         "GetMachineByPID",
1978                         &error,
1979                         &reply,
1980                         "u",
1981                         (uint32_t) pid);
1982         if (r < 0) {
1983                 /* Note that the machine might already have been
1984                  * cleaned up automatically, hence don't consider it a
1985                  * failure if we cannot get the machine object. */
1986                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1987                 return 0;
1988         }
1989
1990         r = sd_bus_message_read(reply, "o", &path);
1991         if (r < 0)
1992                 return bus_log_parse_error(r);
1993
1994         r = sd_bus_call_method(
1995                         bus,
1996                         "org.freedesktop.machine1",
1997                         path,
1998                         "org.freedesktop.machine1.Machine",
1999                         "Terminate",
2000                         &error,
2001                         NULL,
2002                         NULL);
2003         if (r < 0) {
2004                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2005                 return 0;
2006         }
2007
2008         return 0;
2009 }
2010
2011 static int reset_audit_loginuid(void) {
2012         _cleanup_free_ char *p = NULL;
2013         int r;
2014
2015         if (arg_share_system)
2016                 return 0;
2017
2018         r = read_one_line_file("/proc/self/loginuid", &p);
2019         if (r == -ENOENT)
2020                 return 0;
2021         if (r < 0)
2022                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2023
2024         /* Already reset? */
2025         if (streq(p, "4294967295"))
2026                 return 0;
2027
2028         r = write_string_file("/proc/self/loginuid", "4294967295");
2029         if (r < 0) {
2030                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2031                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2032                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2033                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2034                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2035
2036                 sleep(5);
2037         }
2038
2039         return 0;
2040 }
2041
2042 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2043 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2044 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2045
2046 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2047         uint8_t result[8];
2048         size_t l, sz;
2049         uint8_t *v, *i;
2050         int r;
2051
2052         l = strlen(arg_machine);
2053         sz = sizeof(sd_id128_t) + l;
2054         if (idx > 0)
2055                 sz += sizeof(idx);
2056
2057         v = alloca(sz);
2058
2059         /* fetch some persistent data unique to the host */
2060         r = sd_id128_get_machine((sd_id128_t*) v);
2061         if (r < 0)
2062                 return r;
2063
2064         /* combine with some data unique (on this host) to this
2065          * container instance */
2066         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2067         if (idx > 0) {
2068                 idx = htole64(idx);
2069                 memcpy(i, &idx, sizeof(idx));
2070         }
2071
2072         /* Let's hash the host machine ID plus the container name. We
2073          * use a fixed, but originally randomly created hash key here. */
2074         siphash24(result, v, sz, hash_key.bytes);
2075
2076         assert_cc(ETH_ALEN <= sizeof(result));
2077         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2078
2079         /* see eth_random_addr in the kernel */
2080         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2081         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2082
2083         return 0;
2084 }
2085
2086 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2087         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2088         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2089         struct ether_addr mac_host, mac_container;
2090         int r, i;
2091
2092         if (!arg_private_network)
2093                 return 0;
2094
2095         if (!arg_network_veth)
2096                 return 0;
2097
2098         /* Use two different interface name prefixes depending whether
2099          * we are in bridge mode or not. */
2100         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2101                  arg_network_bridge ? "vb" : "ve", arg_machine);
2102
2103         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2104         if (r < 0)
2105                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2106
2107         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2108         if (r < 0)
2109                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2110
2111         r = sd_rtnl_open(&rtnl, 0);
2112         if (r < 0)
2113                 return log_error_errno(r, "Failed to connect to netlink: %m");
2114
2115         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2116         if (r < 0)
2117                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2118
2119         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2120         if (r < 0)
2121                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2122
2123         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2124         if (r < 0)
2125                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2126
2127         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2128         if (r < 0)
2129                 return log_error_errno(r, "Failed to open netlink container: %m");
2130
2131         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2132         if (r < 0)
2133                 return log_error_errno(r, "Failed to open netlink container: %m");
2134
2135         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2136         if (r < 0)
2137                 return log_error_errno(r, "Failed to open netlink container: %m");
2138
2139         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2140         if (r < 0)
2141                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2142
2143         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2144         if (r < 0)
2145                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2146
2147         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2148         if (r < 0)
2149                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2150
2151         r = sd_rtnl_message_close_container(m);
2152         if (r < 0)
2153                 return log_error_errno(r, "Failed to close netlink container: %m");
2154
2155         r = sd_rtnl_message_close_container(m);
2156         if (r < 0)
2157                 return log_error_errno(r, "Failed to close netlink container: %m");
2158
2159         r = sd_rtnl_message_close_container(m);
2160         if (r < 0)
2161                 return log_error_errno(r, "Failed to close netlink container: %m");
2162
2163         r = sd_rtnl_call(rtnl, m, 0, NULL);
2164         if (r < 0)
2165                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2166
2167         i = (int) if_nametoindex(iface_name);
2168         if (i <= 0)
2169                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2170
2171         *ifi = i;
2172
2173         return 0;
2174 }
2175
2176 static int setup_bridge(const char veth_name[], int *ifi) {
2177         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2178         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2179         int r, bridge;
2180
2181         if (!arg_private_network)
2182                 return 0;
2183
2184         if (!arg_network_veth)
2185                 return 0;
2186
2187         if (!arg_network_bridge)
2188                 return 0;
2189
2190         bridge = (int) if_nametoindex(arg_network_bridge);
2191         if (bridge <= 0)
2192                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2193
2194         *ifi = bridge;
2195
2196         r = sd_rtnl_open(&rtnl, 0);
2197         if (r < 0)
2198                 return log_error_errno(r, "Failed to connect to netlink: %m");
2199
2200         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2201         if (r < 0)
2202                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2203
2204         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2205         if (r < 0)
2206                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2207
2208         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2209         if (r < 0)
2210                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2211
2212         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2213         if (r < 0)
2214                 return log_error_errno(r, "Failed to add netlink master field: %m");
2215
2216         r = sd_rtnl_call(rtnl, m, 0, NULL);
2217         if (r < 0)
2218                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2219
2220         return 0;
2221 }
2222
2223 static int parse_interface(struct udev *udev, const char *name) {
2224         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2225         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2226         int ifi;
2227
2228         ifi = (int) if_nametoindex(name);
2229         if (ifi <= 0)
2230                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2231
2232         sprintf(ifi_str, "n%i", ifi);
2233         d = udev_device_new_from_device_id(udev, ifi_str);
2234         if (!d)
2235                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2236
2237         if (udev_device_get_is_initialized(d) <= 0) {
2238                 log_error("Network interface %s is not initialized yet.", name);
2239                 return -EBUSY;
2240         }
2241
2242         return ifi;
2243 }
2244
2245 static int move_network_interfaces(pid_t pid) {
2246         _cleanup_udev_unref_ struct udev *udev = NULL;
2247         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2248         char **i;
2249         int r;
2250
2251         if (!arg_private_network)
2252                 return 0;
2253
2254         if (strv_isempty(arg_network_interfaces))
2255                 return 0;
2256
2257         r = sd_rtnl_open(&rtnl, 0);
2258         if (r < 0)
2259                 return log_error_errno(r, "Failed to connect to netlink: %m");
2260
2261         udev = udev_new();
2262         if (!udev) {
2263                 log_error("Failed to connect to udev.");
2264                 return -ENOMEM;
2265         }
2266
2267         STRV_FOREACH(i, arg_network_interfaces) {
2268                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2269                 int ifi;
2270
2271                 ifi = parse_interface(udev, *i);
2272                 if (ifi < 0)
2273                         return ifi;
2274
2275                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2276                 if (r < 0)
2277                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2278
2279                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2280                 if (r < 0)
2281                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2282
2283                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2284                 if (r < 0)
2285                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2286         }
2287
2288         return 0;
2289 }
2290
2291 static int setup_macvlan(pid_t pid) {
2292         _cleanup_udev_unref_ struct udev *udev = NULL;
2293         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2294         unsigned idx = 0;
2295         char **i;
2296         int r;
2297
2298         if (!arg_private_network)
2299                 return 0;
2300
2301         if (strv_isempty(arg_network_macvlan))
2302                 return 0;
2303
2304         r = sd_rtnl_open(&rtnl, 0);
2305         if (r < 0)
2306                 return log_error_errno(r, "Failed to connect to netlink: %m");
2307
2308         udev = udev_new();
2309         if (!udev) {
2310                 log_error("Failed to connect to udev.");
2311                 return -ENOMEM;
2312         }
2313
2314         STRV_FOREACH(i, arg_network_macvlan) {
2315                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2316                 _cleanup_free_ char *n = NULL;
2317                 struct ether_addr mac;
2318                 int ifi;
2319
2320                 ifi = parse_interface(udev, *i);
2321                 if (ifi < 0)
2322                         return ifi;
2323
2324                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2325                 if (r < 0)
2326                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2327
2328                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2329                 if (r < 0)
2330                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2331
2332                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2333                 if (r < 0)
2334                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2335
2336                 n = strappend("mv-", *i);
2337                 if (!n)
2338                         return log_oom();
2339
2340                 strshorten(n, IFNAMSIZ-1);
2341
2342                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2343                 if (r < 0)
2344                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2345
2346                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2347                 if (r < 0)
2348                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2349
2350                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2351                 if (r < 0)
2352                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2353
2354                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2355                 if (r < 0)
2356                         return log_error_errno(r, "Failed to open netlink container: %m");
2357
2358                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2359                 if (r < 0)
2360                         return log_error_errno(r, "Failed to open netlink container: %m");
2361
2362                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2363                 if (r < 0)
2364                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2365
2366                 r = sd_rtnl_message_close_container(m);
2367                 if (r < 0)
2368                         return log_error_errno(r, "Failed to close netlink container: %m");
2369
2370                 r = sd_rtnl_message_close_container(m);
2371                 if (r < 0)
2372                         return log_error_errno(r, "Failed to close netlink container: %m");
2373
2374                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2375                 if (r < 0)
2376                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2377         }
2378
2379         return 0;
2380 }
2381
/* Installs the seccomp filter for the container payload: everything is
 * allowed by default, the system calls in the table below fail with EPERM,
 * and creating NETLINK_AUDIT sockets fails with EAFNOSUPPORT.
 *
 * Returns 0 on success (or when built without seccomp support) and a
 * negative value on failure. */
static int setup_seccomp(void) {

#ifndef HAVE_SECCOMP
        return 0;
#else
        static const int denied_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(denied_syscalls); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), denied_syscalls[n], 0);

                /* -EFAULT is treated as "unknown syscall" and skipped */
                if (r >= 0 || r == -EFAULT)
                        continue;

                log_error_errno(r, "Failed to block syscall: %m");
                goto finish;
        }

        /* Audit does not work inside containers, and much of the userspace
         * audit hookup fails there. We do not care and simply refuse the
         * creation of audit sockets: socket(AF_NETLINK, *, NETLINK_AUDIT)
         * then fails with EAFNOSUPPORT, which audit userspace takes as the
         * sign that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on success and failure alike */
        seccomp_release(ctx);
        return r;
#endif

}
2461
2462 static int setup_propagate(const char *root) {
2463         const char *p, *q;
2464
2465         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2466         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2467         p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2468         (void) mkdir_p(p, 0600);
2469
2470         q = strappenda(root, "/run/systemd/nspawn/incoming");
2471         mkdir_parents(q, 0755);
2472         mkdir_p(q, 0600);
2473
2474         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2475                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2476
2477         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2478                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2479
2480         return 0;
2481 }
2482
2483 static int setup_image(char **device_path, int *loop_nr) {
2484         struct loop_info64 info = {
2485                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2486         };
2487         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2488         _cleanup_free_ char* loopdev = NULL;
2489         struct stat st;
2490         int r, nr;
2491
2492         assert(device_path);
2493         assert(loop_nr);
2494         assert(arg_image);
2495
2496         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2497         if (fd < 0)
2498                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2499
2500         if (fstat(fd, &st) < 0)
2501                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2502
2503         if (S_ISBLK(st.st_mode)) {
2504                 char *p;
2505
2506                 p = strdup(arg_image);
2507                 if (!p)
2508                         return log_oom();
2509
2510                 *device_path = p;
2511
2512                 *loop_nr = -1;
2513
2514                 r = fd;
2515                 fd = -1;
2516
2517                 return r;
2518         }
2519
2520         if (!S_ISREG(st.st_mode)) {
2521                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2522                 return -EINVAL;
2523         }
2524
2525         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2526         if (control < 0)
2527                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2528
2529         nr = ioctl(control, LOOP_CTL_GET_FREE);
2530         if (nr < 0)
2531                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2532
2533         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2534                 return log_oom();
2535
2536         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2537         if (loop < 0)
2538                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2539
2540         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2541                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2542
2543         if (arg_read_only)
2544                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2545
2546         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2547                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2548
2549         *device_path = loopdev;
2550         loopdev = NULL;
2551
2552         *loop_nr = nr;
2553
2554         r = loop;
2555         loop = -1;
2556
2557         return r;
2558 }
2559
/* Looks for the partitions of the block device behind "fd" that are to be
 * mounted into the container.
 *
 * The device is probed with libblkid and must carry a GPT. Its child block
 * devices are enumerated through udev and matched against the GPT type
 * UUIDs GPT_HOME, GPT_SRV and, where defined at build time, GPT_ROOT_NATIVE
 * and GPT_ROOT_SECONDARY. Partitions flagged GPT_FLAG_NO_AUTO are skipped.
 * If several partitions share a type, the one with the lowest partition
 * number is used.
 *
 * On success returns 0 and:
 *   - *root_device / *root_device_rw describe the native root partition or,
 *     if there is none, the secondary one; *secondary says which was used;
 *   - *home_device / *home_device_rw and *srv_device / *srv_device_rw are
 *     set only if such a partition was found, otherwise left untouched.
 * The device strings are heap allocated and owned by the caller. The *_rw
 * values are false if the partition carries GPT_FLAG_READ_ONLY.
 *
 * On failure a negative value is returned and no output is written.
 * Without blkid support the function always fails with -ENOTSUP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif

        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared before the libblkid calls below; if it is still
         * 0 after a failure, the failure is reported as out-of-memory */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                /* Include the reason, like the other errno-based failures here */
                return log_error_errno(errno, "Failed to list partitions of %s: %m", arg_image);
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                if (sd_id128_equal(type_id, GPT_HOME)) {

                        /* Keep the candidate with the lowest partition number */
                        if (home && nr >= home_nr)
                                continue;

                        home_nr = nr;
                        home_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(home);
                        home = strdup(node);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_id, GPT_SRV)) {

                        if (srv && nr >= srv_nr)
                                continue;

                        srv_nr = nr;
                        srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(srv);
                        srv = strdup(node);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                        if (root && nr >= root_nr)
                                continue;

                        root_nr = nr;
                        root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(root);
                        root = strdup(node);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && nr >= secondary_root_nr)
                                continue;

                        secondary_root_nr = nr;
                        secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(secondary_root);
                        secondary_root = strdup(node);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* From here on nothing can fail; move ownership of the strings to the caller */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2813
/* Mounts the block device "what" on "where", or on "where" + "directory" if
 * "directory" is not NULL. The file system type is detected with libblkid.
 * The mount is read-only if "rw" is false or arg_read_only is set, and
 * always uses MS_NODEV.
 *
 * Returns 0 on success and a negative value on failure: -EINVAL if the file
 * system type cannot be determined, -ENOTSUP for LUKS volumes or when built
 * without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe(): 1 = nothing detected, -2 = ambivalent result,
         * -1 = error. Only the first two mean "type unknown"; a real error
         * goes through the errno path below (same check as dissect_image()). */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2877
2878 static int mount_devices(
2879                 const char *where,
2880                 const char *root_device, bool root_device_rw,
2881                 const char *home_device, bool home_device_rw,
2882                 const char *srv_device, bool srv_device_rw) {
2883         int r;
2884
2885         assert(where);
2886
2887         if (root_device) {
2888                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2889                 if (r < 0)
2890                         return log_error_errno(r, "Failed to mount root directory: %m");
2891         }
2892
2893         if (home_device) {
2894                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2895                 if (r < 0)
2896                         return log_error_errno(r, "Failed to mount home directory: %m");
2897         }
2898
2899         if (srv_device) {
2900                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2901                 if (r < 0)
2902                         return log_error_errno(r, "Failed to mount server data directory: %m");
2903         }
2904
2905         return 0;
2906 }
2907
2908 static void loop_remove(int nr, int *image_fd) {
2909         _cleanup_close_ int control = -1;
2910         int r;
2911
2912         if (nr < 0)
2913                 return;
2914
2915         if (image_fd && *image_fd >= 0) {
2916                 r = ioctl(*image_fd, LOOP_CLR_FD);
2917                 if (r < 0)
2918                         log_warning_errno(errno, "Failed to close loop image: %m");
2919                 *image_fd = safe_close(*image_fd);
2920         }
2921
2922         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2923         if (control < 0) {
2924                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2925                 return;
2926         }
2927
2928         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2929         if (r < 0)
2930                 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
2931 }
2932
/* Forks off "getent <database> <key>" with an empty environment, stdout
 * connected to a pipe and stdin/stderr connected to /dev/null.
 *
 * On success returns the read end of the pipe (owned by the caller) and
 * stores the child's PID in *rpid; the caller is expected to read the
 * output and reap the child. On failure returns a negative value and
 * leaves no descriptors open. */
static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
        int pipe_fds[2];
        pid_t pid;

        assert(database);
        assert(key);
        assert(rpid);

        if (pipe2(pipe_fds, O_CLOEXEC) < 0)
                return log_error_errno(errno, "Failed to allocate pipe: %m");

        pid = fork();
        if (pid < 0) {
                int r;

                r = log_error_errno(errno, "Failed to fork getent child: %m");

                /* No child exists that could use the pipe, so release both ends */
                safe_close(pipe_fds[0]);
                safe_close(pipe_fds[1]);

                return r;
        } else if (pid == 0) {
                int nullfd;
                char *empty_env = NULL;

                /* Child: wire up stdio, then exec. Errors can only be
                 * reported through the exit status. */
                if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                /* Never close a descriptor that happens to be one of 0..2 */
                if (pipe_fds[0] > 2)
                        safe_close(pipe_fds[0]);
                if (pipe_fds[1] > 2)
                        safe_close(pipe_fds[1]);

                nullfd = open("/dev/null", O_RDWR);
                if (nullfd < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDIN_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDERR_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (nullfd > 2)
                        safe_close(nullfd);

                reset_all_signal_handlers();
                close_all_fds(NULL, 0);

                /* Try both common locations; the second call only runs if the first failed */
                execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
                execle("/bin/getent", "getent", database, key, NULL, &empty_env);
                _exit(EXIT_FAILURE);
        }

        /* Parent: only the read end is needed */
        pipe_fds[1] = safe_close(pipe_fds[1]);

        *rpid = pid;

        return pipe_fds[0];
}
2986
2987 static int change_uid_gid(char **_home) {
2988         char line[LINE_MAX], *x, *u, *g, *h;
2989         const char *word, *state;
2990         _cleanup_free_ uid_t *uids = NULL;
2991         _cleanup_free_ char *home = NULL;
2992         _cleanup_fclose_ FILE *f = NULL;
2993         _cleanup_close_ int fd = -1;
2994         unsigned n_uids = 0;
2995         size_t sz = 0, l;
2996         uid_t uid;
2997         gid_t gid;
2998         pid_t pid;
2999         int r;
3000
3001         assert(_home);
3002
3003         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3004                 /* Reset everything fully to 0, just in case */
3005
3006                 if (setgroups(0, NULL) < 0)
3007                         return log_error_errno(errno, "setgroups() failed: %m");
3008
3009                 if (setresgid(0, 0, 0) < 0)
3010                         return log_error_errno(errno, "setregid() failed: %m");
3011
3012                 if (setresuid(0, 0, 0) < 0)
3013                         return log_error_errno(errno, "setreuid() failed: %m");
3014
3015                 *_home = NULL;
3016                 return 0;
3017         }
3018
3019         /* First, get user credentials */
3020         fd = spawn_getent("passwd", arg_user, &pid);
3021         if (fd < 0)
3022                 return fd;
3023
3024         f = fdopen(fd, "r");
3025         if (!f)
3026                 return log_oom();
3027         fd = -1;
3028
3029         if (!fgets(line, sizeof(line), f)) {
3030
3031                 if (!ferror(f)) {
3032                         log_error("Failed to resolve user %s.", arg_user);
3033                         return -ESRCH;
3034                 }
3035
3036                 log_error_errno(errno, "Failed to read from getent: %m");
3037                 return -errno;
3038         }
3039
3040         truncate_nl(line);
3041
3042         wait_for_terminate_and_warn("getent passwd", pid, true);
3043
3044         x = strchr(line, ':');
3045         if (!x) {
3046                 log_error("/etc/passwd entry has invalid user field.");
3047                 return -EIO;
3048         }
3049
3050         u = strchr(x+1, ':');
3051         if (!u) {
3052                 log_error("/etc/passwd entry has invalid password field.");
3053                 return -EIO;
3054         }
3055
3056         u++;
3057         g = strchr(u, ':');
3058         if (!g) {
3059                 log_error("/etc/passwd entry has invalid UID field.");
3060                 return -EIO;
3061         }
3062
3063         *g = 0;
3064         g++;
3065         x = strchr(g, ':');
3066         if (!x) {
3067                 log_error("/etc/passwd entry has invalid GID field.");
3068                 return -EIO;
3069         }
3070
3071         *x = 0;
3072         h = strchr(x+1, ':');
3073         if (!h) {
3074                 log_error("/etc/passwd entry has invalid GECOS field.");
3075                 return -EIO;
3076         }
3077
3078         h++;
3079         x = strchr(h, ':');
3080         if (!x) {
3081                 log_error("/etc/passwd entry has invalid home directory field.");
3082                 return -EIO;
3083         }
3084
3085         *x = 0;
3086
3087         r = parse_uid(u, &uid);
3088         if (r < 0) {
3089                 log_error("Failed to parse UID of user.");
3090                 return -EIO;
3091         }
3092
3093         r = parse_gid(g, &gid);
3094         if (r < 0) {
3095                 log_error("Failed to parse GID of user.");
3096                 return -EIO;
3097         }
3098
3099         home = strdup(h);
3100         if (!home)
3101                 return log_oom();
3102
3103         /* Second, get group memberships */
3104         fd = spawn_getent("initgroups", arg_user, &pid);
3105         if (fd < 0)
3106                 return fd;
3107
3108         fclose(f);
3109         f = fdopen(fd, "r");
3110         if (!f)
3111                 return log_oom();
3112         fd = -1;
3113
3114         if (!fgets(line, sizeof(line), f)) {
3115                 if (!ferror(f)) {
3116                         log_error("Failed to resolve user %s.", arg_user);
3117                         return -ESRCH;
3118                 }
3119
3120                 log_error_errno(errno, "Failed to read from getent: %m");
3121                 return -errno;
3122         }
3123
3124         truncate_nl(line);
3125
3126         wait_for_terminate_and_warn("getent initgroups", pid, true);
3127
3128         /* Skip over the username and subsequent separator whitespace */
3129         x = line;
3130         x += strcspn(x, WHITESPACE);
3131         x += strspn(x, WHITESPACE);
3132
3133         FOREACH_WORD(word, l, x, state) {
3134                 char c[l+1];
3135
3136                 memcpy(c, word, l);
3137                 c[l] = 0;
3138
3139                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3140                         return log_oom();
3141
3142                 r = parse_uid(c, &uids[n_uids++]);
3143                 if (r < 0) {
3144                         log_error("Failed to parse group data from getent.");
3145                         return -EIO;
3146                 }
3147         }
3148
3149         r = mkdir_parents(home, 0775);
3150         if (r < 0)
3151                 return log_error_errno(r, "Failed to make home root directory: %m");
3152
3153         r = mkdir_safe(home, 0755, uid, gid);
3154         if (r < 0 && r != -EEXIST)
3155                 return log_error_errno(r, "Failed to make home directory: %m");
3156
3157         fchown(STDIN_FILENO, uid, gid);
3158         fchown(STDOUT_FILENO, uid, gid);
3159         fchown(STDERR_FILENO, uid, gid);
3160
3161         if (setgroups(n_uids, uids) < 0)
3162                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3163
3164         if (setresgid(gid, gid, gid) < 0)
3165                 return log_error_errno(errno, "setregid() failed: %m");
3166
3167         if (setresuid(uid, uid, uid) < 0)
3168                 return log_error_errno(errno, "setreuid() failed: %m");
3169
3170         if (_home) {
3171                 *_home = home;
3172                 home = NULL;
3173         }
3174
3175         return 0;
3176 }
3177
3178 /*
3179  * Return values:
3180  * < 0 : wait_for_terminate() failed to get the state of the
3181  *       container, the container was terminated by a signal, or
3182  *       failed for an unknown reason.  No change is made to the
3183  *       container argument.
3184  * > 0 : The program executed in the container terminated with an
3185  *       error.  The exit code of the program executed in the
3186  *       container is returned.  The container argument has been set
3187  *       to CONTAINER_TERMINATED.
3188  *   0 : The container is being rebooted, has been shut down or exited
3189  *       successfully.  The container argument has been set to either
3190  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3191  *
3192  * That is, success is indicated by a return value of zero, and an
3193  * error is indicated by a non-zero value.
3194  */
3195 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3196         siginfo_t status;
3197         int r;
3198
3199         r = wait_for_terminate(pid, &status);
3200         if (r < 0)
3201                 return log_warning_errno(r, "Failed to wait for container: %m");
3202
3203         switch (status.si_code) {
3204
3205         case CLD_EXITED:
3206                 if (status.si_status == 0) {
3207                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3208
3209                 } else
3210                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3211
3212                 *container = CONTAINER_TERMINATED;
3213                 return status.si_status;
3214
3215         case CLD_KILLED:
3216                 if (status.si_status == SIGINT) {
3217
3218                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3219                         *container = CONTAINER_TERMINATED;
3220                         return 0;
3221
3222                 } else if (status.si_status == SIGHUP) {
3223
3224                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3225                         *container = CONTAINER_REBOOTED;
3226                         return 0;
3227                 }
3228
3229                 /* CLD_KILLED fallthrough */
3230
3231         case CLD_DUMPED:
3232                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3233                 return -EIO;
3234
3235         default:
3236                 log_error("Container %s failed due to unknown reason.", arg_machine);
3237                 return -EIO;
3238         }
3239
3240         return r;
3241 }
3242
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
3244
3245 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3246         pid_t pid;
3247
3248         pid = PTR_TO_UINT32(userdata);
3249         if (pid > 0) {
3250                 if (kill(pid, SIGRTMIN+3) >= 0) {
3251                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3252                         sd_event_source_set_userdata(s, NULL);
3253                         return 0;
3254                 }
3255         }
3256
3257         sd_event_exit(sd_event_source_get_event(s), 0);
3258         return 0;
3259 }
3260
3261 static int determine_names(void) {
3262         int r;
3263
3264         if (!arg_image && !arg_directory) {
3265                 if (arg_machine) {
3266                         _cleanup_(image_unrefp) Image *i = NULL;
3267
3268                         r = image_find(arg_machine, &i);
3269                         if (r < 0)
3270                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3271                         else if (r == 0) {
3272                                 log_error("No image for machine '%s': %m", arg_machine);
3273                                 return -ENOENT;
3274                         }
3275
3276                         if (i->type == IMAGE_GPT)
3277                                 r = set_sanitized_path(&arg_image, i->path);
3278                         else
3279                                 r = set_sanitized_path(&arg_directory, i->path);
3280                         if (r < 0)
3281                                 return log_error_errno(r, "Invalid image directory: %m");
3282
3283                         arg_read_only = arg_read_only || i->read_only;
3284                 } else
3285                         arg_directory = get_current_dir_name();
3286
3287                 if (!arg_directory && !arg_machine) {
3288                         log_error("Failed to determine path, please use -D or -i.");
3289                         return -EINVAL;
3290                 }
3291         }
3292
3293         if (!arg_machine) {
3294                 if (arg_directory && path_equal(arg_directory, "/"))
3295                         arg_machine = gethostname_malloc();
3296                 else
3297                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3298
3299                 if (!arg_machine)
3300                         return log_oom();
3301
3302                 hostname_cleanup(arg_machine, false);
3303                 if (!machine_name_is_valid(arg_machine)) {
3304                         log_error("Failed to determine machine name automatically, please use -M.");
3305                         return -EINVAL;
3306                 }
3307
3308                 if (arg_ephemeral) {
3309                         char *b;
3310
3311                         /* Add a random suffix when this is an
3312                          * ephemeral machine, so that we can run many
3313                          * instances at once without manually having
3314                          * to specify -M each time. */
3315
3316                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3317                                 return log_oom();
3318
3319                         free(arg_machine);
3320                         arg_machine = b;
3321                 }
3322         }
3323
3324         return 0;
3325 }
3326
3327 int main(int argc, char *argv[]) {
3328
3329         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3330         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3331         _cleanup_close_ int master = -1, image_fd = -1;
3332         _cleanup_fdset_free_ FDSet *fds = NULL;
3333         int r, n_fd_passed, loop_nr = -1;
3334         char veth_name[IFNAMSIZ];
3335         bool secondary = false, remove_subvol = false;
3336         sigset_t mask, mask_chld;
3337         pid_t pid = 0;
3338         int ret = EXIT_SUCCESS;
3339         union in_addr_union exposed = {};
3340
3341         log_parse_environment();
3342         log_open();
3343
3344         r = parse_argv(argc, argv);
3345         if (r <= 0)
3346                 goto finish;
3347
3348         r = determine_names();
3349         if (r < 0)
3350                 goto finish;
3351
3352         if (geteuid() != 0) {
3353                 log_error("Need to be root.");
3354                 r = -EPERM;
3355                 goto finish;
3356         }
3357
3358         if (sd_booted() <= 0) {
3359                 log_error("Not running on a systemd system.");
3360                 r = -EINVAL;
3361                 goto finish;
3362         }
3363
3364         log_close();
3365         n_fd_passed = sd_listen_fds(false);
3366         if (n_fd_passed > 0) {
3367                 r = fdset_new_listen_fds(&fds, false);
3368                 if (r < 0) {
3369                         log_error_errno(r, "Failed to collect file descriptors: %m");
3370                         goto finish;
3371                 }
3372         }
3373         fdset_close_others(fds);
3374         log_open();
3375
3376         if (arg_directory) {
3377                 assert(!arg_image);
3378
3379                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3380                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3381                         r = -EINVAL;
3382                         goto finish;
3383                 }
3384
3385                 if (arg_template) {
3386                         r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3387                         if (r == -EEXIST) {
3388                                 if (!arg_quiet)
3389                                         log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3390                         } else if (r < 0) {
3391                                 log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
3392                                 goto finish;
3393                         } else {
3394                                 if (!arg_quiet)
3395                                         log_info("Populated %s from template %s.", arg_directory, arg_template);
3396                         }
3397
3398                 } else if (arg_ephemeral) {
3399                         char *np;
3400
3401                         /* If the specified path is a mount point we
3402                          * generate the new snapshot immediately
3403                          * inside it under a random name. However if
3404                          * the specified is not a mount point we
3405                          * create the new snapshot in the parent
3406                          * directory, just next to it. */
3407                         r = path_is_mount_point(arg_directory, false);
3408                         if (r < 0) {
3409                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3410                                 goto finish;
3411                         }
3412                         if (r > 0)
3413                                 r = tempfn_random_child(arg_directory, &np);
3414                         else
3415                                 r = tempfn_random(arg_directory, &np);
3416                         if (r < 0) {
3417                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3418                                 goto finish;
3419                         }
3420
3421                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3422                         if (r < 0) {
3423                                 free(np);
3424                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3425                                 goto finish;
3426                         }
3427
3428                         free(arg_directory);
3429                         arg_directory = np;
3430
3431                         remove_subvol = true;
3432                 }
3433
3434                 if (arg_boot) {
3435                         if (path_is_os_tree(arg_directory) <= 0) {
3436                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3437                                 r = -EINVAL;
3438                                 goto finish;
3439                         }
3440                 } else {
3441                         const char *p;
3442
3443                         p = strappenda(arg_directory,
3444                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3445                         if (access(p, F_OK) < 0) {
3446                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3447                                 r = -EINVAL;
3448                                 goto finish;
3449                         }
3450                 }
3451
3452         } else {
3453                 char template[] = "/tmp/nspawn-root-XXXXXX";
3454
3455                 assert(arg_image);
3456                 assert(!arg_template);
3457
3458                 if (!mkdtemp(template)) {
3459                         log_error_errno(errno, "Failed to create temporary directory: %m");
3460                         r = -errno;
3461                         goto finish;
3462                 }
3463
3464                 arg_directory = strdup(template);
3465                 if (!arg_directory) {
3466                         r = log_oom();
3467                         goto finish;
3468                 }
3469
3470                 image_fd = setup_image(&device_path, &loop_nr);
3471                 if (image_fd < 0) {
3472                         r = image_fd;
3473                         goto finish;
3474                 }
3475
3476                 r = dissect_image(image_fd,
3477                                   &root_device, &root_device_rw,
3478                                   &home_device, &home_device_rw,
3479                                   &srv_device, &srv_device_rw,
3480                                   &secondary);
3481                 if (r < 0)
3482                         goto finish;
3483         }
3484
3485         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3486         if (master < 0) {
3487                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3488                 goto finish;
3489         }
3490
3491         r = ptsname_malloc(master, &console);
3492         if (r < 0) {
3493                 r = log_error_errno(r, "Failed to determine tty name: %m");
3494                 goto finish;
3495         }
3496
3497         if (!arg_quiet)
3498                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3499                          arg_machine, arg_image ?: arg_directory);
3500
3501         if (unlockpt(master) < 0) {
3502                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3503                 goto finish;
3504         }
3505
3506         assert_se(sigemptyset(&mask) == 0);
3507         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3508         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3509
3510         assert_se(sigemptyset(&mask_chld) == 0);
3511         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3512
3513         for (;;) {
3514                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3515                 ContainerStatus container_status;
3516                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3517                 struct sigaction sa = {
3518                         .sa_handler = nop_handler,
3519                         .sa_flags = SA_NOCLDSTOP,
3520                 };
3521
3522                 r = barrier_create(&barrier);
3523                 if (r < 0) {
3524                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3525                         goto finish;
3526                 }
3527
3528                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3529                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3530                         goto finish;
3531                 }
3532
3533                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3534                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3535                         goto finish;
3536                 }
3537
3538                 /* Child can be killed before execv(), so handle SIGCHLD
3539                  * in order to interrupt parent's blocking calls and
3540                  * give it a chance to call wait() and terminate. */
3541                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3542                 if (r < 0) {
3543                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3544                         goto finish;
3545                 }
3546
3547                 r = sigaction(SIGCHLD, &sa, NULL);
3548                 if (r < 0) {
3549                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3550                         goto finish;
3551                 }
3552
3553                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3554                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3555                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3556                 if (pid < 0) {
3557                         if (errno == EINVAL)
3558                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3559                         else
3560                                 r = log_error_errno(errno, "clone() failed: %m");
3561
3562                         goto finish;
3563                 }
3564
3565                 if (pid == 0) {
3566                         /* child */
3567                         _cleanup_free_ char *home = NULL;
3568                         unsigned n_env = 2;
3569                         const char *envp[] = {
3570                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3571                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3572                                 NULL, /* TERM */
3573                                 NULL, /* HOME */
3574                                 NULL, /* USER */
3575                                 NULL, /* LOGNAME */
3576                                 NULL, /* container_uuid */
3577                                 NULL, /* LISTEN_FDS */
3578                                 NULL, /* LISTEN_PID */
3579                                 NULL
3580                         };
3581                         char **env_use;
3582
3583                         barrier_set_role(&barrier, BARRIER_CHILD);
3584
3585                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3586                         if (envp[n_env])
3587                                 n_env ++;
3588
3589                         master = safe_close(master);
3590
3591                         close_nointr(STDIN_FILENO);
3592                         close_nointr(STDOUT_FILENO);
3593                         close_nointr(STDERR_FILENO);
3594
3595                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3596                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3597
3598                         reset_all_signal_handlers();
3599                         reset_signal_mask();
3600
3601                         r = open_terminal(console, O_RDWR);
3602                         if (r != STDIN_FILENO) {
3603                                 if (r >= 0) {
3604                                         safe_close(r);
3605                                         r = -EINVAL;
3606                                 }
3607
3608                                 log_error_errno(r, "Failed to open console: %m");
3609                                 _exit(EXIT_FAILURE);
3610                         }
3611
3612                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3613                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3614                                 log_error_errno(errno, "Failed to duplicate console: %m");
3615                                 _exit(EXIT_FAILURE);
3616                         }
3617
3618                         if (setsid() < 0) {
3619                                 log_error_errno(errno, "setsid() failed: %m");
3620                                 _exit(EXIT_FAILURE);
3621                         }
3622
3623                         if (reset_audit_loginuid() < 0)
3624                                 _exit(EXIT_FAILURE);
3625
3626                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3627                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3628                                 _exit(EXIT_FAILURE);
3629                         }
3630
3631                         /* Mark everything as slave, so that we still
3632                          * receive mounts from the real root, but don't
3633                          * propagate mounts to the real root. */
3634                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3635                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3636                                 _exit(EXIT_FAILURE);
3637                         }
3638
3639                         if (mount_devices(arg_directory,
3640                                           root_device, root_device_rw,
3641                                           home_device, home_device_rw,
3642                                           srv_device, srv_device_rw) < 0)
3643                                 _exit(EXIT_FAILURE);
3644
3645                         /* Turn directory into bind mount */
3646                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3647                                 log_error_errno(errno, "Failed to make bind mount: %m");
3648                                 _exit(EXIT_FAILURE);
3649                         }
3650
3651                         r = setup_volatile(arg_directory);
3652                         if (r < 0)
3653                                 _exit(EXIT_FAILURE);
3654
3655                         if (setup_volatile_state(arg_directory) < 0)
3656                                 _exit(EXIT_FAILURE);
3657
3658                         r = base_filesystem_create(arg_directory);
3659                         if (r < 0)
3660                                 _exit(EXIT_FAILURE);
3661
3662                         if (arg_read_only) {
3663                                 r = bind_remount_recursive(arg_directory, true);
3664                                 if (r < 0) {
3665                                         log_error_errno(r, "Failed to make tree read-only: %m");
3666                                         _exit(EXIT_FAILURE);
3667                                 }
3668                         }
3669
3670                         if (mount_all(arg_directory) < 0)
3671                                 _exit(EXIT_FAILURE);
3672
3673                         if (copy_devnodes(arg_directory) < 0)
3674                                 _exit(EXIT_FAILURE);
3675
3676                         if (setup_ptmx(arg_directory) < 0)
3677                                 _exit(EXIT_FAILURE);
3678
3679                         dev_setup(arg_directory);
3680
3681                         if (setup_propagate(arg_directory) < 0)
3682                                 _exit(EXIT_FAILURE);
3683
3684                         if (setup_seccomp() < 0)
3685                                 _exit(EXIT_FAILURE);
3686
3687                         if (setup_dev_console(arg_directory, console) < 0)
3688                                 _exit(EXIT_FAILURE);
3689
3690                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3691                                 _exit(EXIT_FAILURE);
3692                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3693
3694                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
3695                                 _exit(EXIT_FAILURE);
3696                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3697
3698                         /* Tell the parent that we are ready, and that
3699                          * it can cgroupify us to that we lack access
3700                          * to certain devices and resources. */
3701                         (void) barrier_place(&barrier);
3702
3703                         if (setup_boot_id(arg_directory) < 0)
3704                                 _exit(EXIT_FAILURE);
3705
3706                         if (setup_timezone(arg_directory) < 0)
3707                                 _exit(EXIT_FAILURE);
3708
3709                         if (setup_resolv_conf(arg_directory) < 0)
3710                                 _exit(EXIT_FAILURE);
3711
3712                         if (setup_journal(arg_directory) < 0)
3713                                 _exit(EXIT_FAILURE);
3714
3715                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3716                                 _exit(EXIT_FAILURE);
3717
3718                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3719                                 _exit(EXIT_FAILURE);
3720
3721                         if (mount_tmpfs(arg_directory) < 0)
3722                                 _exit(EXIT_FAILURE);
3723
3724                         /* Wait until we are cgroup-ified, so that we
3725                          * can mount the right cgroup path writable */
3726                         (void) barrier_sync_next(&barrier);
3727
3728                         if (mount_cgroup(arg_directory) < 0)
3729                                 _exit(EXIT_FAILURE);
3730
3731                         if (chdir(arg_directory) < 0) {
3732                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3733                                 _exit(EXIT_FAILURE);
3734                         }
3735
3736                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3737                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3738                                 _exit(EXIT_FAILURE);
3739                         }
3740
3741                         if (chroot(".") < 0) {
3742                                 log_error_errno(errno, "chroot() failed: %m");
3743                                 _exit(EXIT_FAILURE);
3744                         }
3745
3746                         if (chdir("/") < 0) {
3747                                 log_error_errno(errno, "chdir() failed: %m");
3748                                 _exit(EXIT_FAILURE);
3749                         }
3750
3751                         umask(0022);
3752
3753                         if (arg_private_network)
3754                                 loopback_setup();
3755
3756                         if (drop_capabilities() < 0) {
3757                                 log_error_errno(errno, "drop_capabilities() failed: %m");
3758                                 _exit(EXIT_FAILURE);
3759                         }
3760
3761                         r = change_uid_gid(&home);
3762                         if (r < 0)
3763                                 _exit(EXIT_FAILURE);
3764
3765                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3766                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3767                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3768                                 log_oom();
3769                                 _exit(EXIT_FAILURE);
3770                         }
3771
3772                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3773                                 char as_uuid[37];
3774
3775                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3776                                         log_oom();
3777                                         _exit(EXIT_FAILURE);
3778                                 }
3779                         }
3780
3781                         if (fdset_size(fds) > 0) {
3782                                 r = fdset_cloexec(fds, false);
3783                                 if (r < 0) {
3784                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3785                                         _exit(EXIT_FAILURE);
3786                                 }
3787
3788                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3789                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3790                                         log_oom();
3791                                         _exit(EXIT_FAILURE);
3792                                 }
3793                         }
3794
3795                         setup_hostname();
3796
3797                         if (arg_personality != 0xffffffffLU) {
3798                                 if (personality(arg_personality) < 0) {
3799                                         log_error_errno(errno, "personality() failed: %m");
3800                                         _exit(EXIT_FAILURE);
3801                                 }
3802                         } else if (secondary) {
3803                                 if (personality(PER_LINUX32) < 0) {
3804                                         log_error_errno(errno, "personality() failed: %m");
3805                                         _exit(EXIT_FAILURE);
3806                                 }
3807                         }
3808
3809 #ifdef HAVE_SELINUX
3810                         if (arg_selinux_context)
3811                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3812                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3813                                         _exit(EXIT_FAILURE);
3814                                 }
3815 #endif
3816
3817                         if (!strv_isempty(arg_setenv)) {
3818                                 char **n;
3819
3820                                 n = strv_env_merge(2, envp, arg_setenv);
3821                                 if (!n) {
3822                                         log_oom();
3823                                         _exit(EXIT_FAILURE);
3824                                 }
3825
3826                                 env_use = n;
3827                         } else
3828                                 env_use = (char**) envp;
3829
3830                         /* Wait until the parent is ready with the setup, too... */
3831                         if (!barrier_place_and_sync(&barrier))
3832                                 _exit(EXIT_FAILURE);
3833
3834                         if (arg_boot) {
3835                                 char **a;
3836                                 size_t l;
3837
3838                                 /* Automatically search for the init system */
3839
3840                                 l = 1 + argc - optind;
3841                                 a = newa(char*, l + 1);
3842                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3843
3844                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3845                                 execve(a[0], a, env_use);
3846
3847                                 a[0] = (char*) "/lib/systemd/systemd";
3848                                 execve(a[0], a, env_use);
3849
3850                                 a[0] = (char*) "/sbin/init";
3851                                 execve(a[0], a, env_use);
3852                         } else if (argc > optind)
3853                                 execvpe(argv[optind], argv + optind, env_use);
3854                         else {
3855                                 chdir(home ? home : "/root");
3856                                 execle("/bin/bash", "-bash", NULL, env_use);
3857                                 execle("/bin/sh", "-sh", NULL, env_use);
3858                         }
3859
3860                         log_error_errno(errno, "execv() failed: %m");
3861                         _exit(EXIT_FAILURE);
3862                 }
3863
3864                 barrier_set_role(&barrier, BARRIER_PARENT);
3865                 fdset_free(fds);
3866                 fds = NULL;
3867
3868                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3869                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3870
3871                 /* Wait for the most basic Child-setup to be done,
3872                  * before we add hardware to it, and place it in a
3873                  * cgroup. */
3874                 if (barrier_sync_next(&barrier)) {
3875                         int ifi = 0;
3876
3877                         r = move_network_interfaces(pid);
3878                         if (r < 0)
3879                                 goto finish;
3880
3881                         r = setup_veth(pid, veth_name, &ifi);
3882                         if (r < 0)
3883                                 goto finish;
3884
3885                         r = setup_bridge(veth_name, &ifi);
3886                         if (r < 0)
3887                                 goto finish;
3888
3889                         r = setup_macvlan(pid);
3890                         if (r < 0)
3891                                 goto finish;
3892
3893                         r = register_machine(pid, ifi);
3894                         if (r < 0)
3895                                 goto finish;
3896
3897                         /* Block SIGCHLD here, before notifying child.
3898                          * process_pty() will handle it with the other signals. */
3899                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3900                         if (r < 0)
3901                                 goto finish;
3902
3903                         /* Reset signal to default */
3904                         r = default_signals(SIGCHLD, -1);
3905                         if (r < 0)
3906                                 goto finish;
3907
3908                         /* Notify the child that the parent is ready with all
3909                          * its setup, and that the child can now hand over
3910                          * control to the code to run inside the container. */
3911                         (void) barrier_place(&barrier);
3912
3913                         /* And wait until the child is completely ready. */
3914                         if (barrier_place_and_sync(&barrier)) {
3915                                 _cleanup_event_unref_ sd_event *event = NULL;
3916                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3917                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
3918                                 char last_char = 0;
3919
3920                                 sd_notify(false,
3921                                           "READY=1\n"
3922                                           "STATUS=Container running.");
3923
3924                                 r = sd_event_new(&event);
3925                                 if (r < 0) {
3926                                         log_error_errno(r, "Failed to get default event source: %m");
3927                                         goto finish;
3928                                 }
3929
3930                                 if (arg_boot) {
3931                                         /* Try to kill the init system on SIGINT or SIGTERM */
3932                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3933                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3934                                 } else {
3935                                         /* Immediately exit */
3936                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3937                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3938                                 }
3939
3940                                 /* simply exit on sigchld */
3941                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3942
3943                                 if (arg_expose_ports) {
3944                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
3945                                         if (r < 0)
3946                                                 goto finish;
3947
3948                                         (void) expose_ports(rtnl, &exposed);
3949                                 }
3950
3951                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3952
3953                                 r = pty_forward_new(event, master, true, &forward);
3954                                 if (r < 0) {
3955                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
3956                                         goto finish;
3957                                 }
3958
3959                                 r = sd_event_loop(event);
3960                                 if (r < 0) {
3961                                         log_error_errno(r, "Failed to run event loop: %m");
3962                                         goto finish;
3963                                 }
3964
3965                                 pty_forward_get_last_char(forward, &last_char);
3966
3967                                 forward = pty_forward_free(forward);
3968
3969                                 if (!arg_quiet && last_char != '\n')
3970                                         putc('\n', stdout);
3971
3972                                 /* Kill if it is not dead yet anyway */
3973                                 terminate_machine(pid);
3974                         }
3975                 }
3976
3977                 /* Normally redundant, but better safe than sorry */
3978                 kill(pid, SIGKILL);
3979
3980                 r = wait_for_container(pid, &container_status);
3981                 pid = 0;
3982
3983                 if (r < 0)
3984                         /* We failed to wait for the container, or the
3985                          * container exited abnormally */
3986                         goto finish;
3987                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3988                         /* The container exited with a non-zero
3989                          * status, or with zero status and no reboot
3990                          * was requested. */
3991                         ret = r;
3992                         break;
3993                 }
3994
3995                 /* CONTAINER_REBOOTED, loop again */
3996
3997                 if (arg_keep_unit) {
3998                         /* Special handling if we are running as a
3999                          * service: instead of simply restarting the
4000                          * machine we want to restart the entire
4001                          * service, so let's inform systemd about this
4002                          * with the special exit code 133. The service
4003                          * file uses RestartForceExitStatus=133 so
4004                          * that this results in a full nspawn
4005                          * restart. This is necessary since we might
4006                          * have cgroup parameters set we want to have
4007                          * flushed out. */
4008                         ret = 133;
4009                         r = 0;
4010                         break;
4011                 }
4012
4013                 flush_ports(&exposed);
4014         }
4015
4016 finish:
4017         sd_notify(false,
4018                   "STOPPING=1\n"
4019                   "STATUS=Terminating...");
4020
4021         loop_remove(loop_nr, &image_fd);
4022
4023         if (pid > 0)
4024                 kill(pid, SIGKILL);
4025
4026         if (remove_subvol && arg_directory) {
4027                 int k;
4028
4029                 k = btrfs_subvol_remove(arg_directory);
4030                 if (k < 0)
4031                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4032         }
4033
4034         if (arg_machine) {
4035                 const char *p;
4036
4037                 p = strappenda("/run/systemd/nspawn/propagate", arg_machine);
4038                 (void) rm_rf(p, false, true, false);
4039         }
4040
4041         free(arg_directory);
4042         free(arg_template);
4043         free(arg_image);
4044         free(arg_machine);
4045         free(arg_user);
4046         strv_free(arg_setenv);
4047         strv_free(arg_network_interfaces);
4048         strv_free(arg_network_macvlan);
4049         strv_free(arg_bind);
4050         strv_free(arg_bind_ro);
4051         strv_free(arg_tmpfs);
4052
4053         flush_ports(&exposed);
4054
4055         while (arg_expose_ports) {
4056                 ExposePort *p = arg_expose_ports;
4057                 LIST_REMOVE(ports, arg_expose_ports, p);
4058                 free(p);
4059         }
4060
4061         return r < 0 ? EXIT_FAILURE : ret;
4062 }