chiark / gitweb /
232629d20ad80c9e4b95f894edf730bf5017ebff
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46 #include <poll.h>
47 #include <sys/file.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89 #include "siphash24.h"
90 #include "copy.h"
91 #include "base-filesystem.h"
92 #include "barrier.h"
93 #include "event-util.h"
94 #include "capability.h"
95 #include "cap-list.h"
96 #include "btrfs-util.h"
97 #include "machine-image.h"
98 #include "list.h"
99 #include "in-addr-util.h"
100 #include "fw-util.h"
101 #include "local-addresses.h"
102
103 #ifdef HAVE_SECCOMP
104 #include "seccomp-util.h"
105 #endif
106
/* One port forwarding request from -p/--port=. parse_argv() allocates one of
 * these per option and prepends it to the arg_expose_ports list. */
typedef struct ExposePort {
        int protocol;                           /* IPPROTO_TCP or IPPROTO_UDP; TCP when no prefix was given */
        uint16_t host_port;                     /* non-zero; equals container_port when only one port was given */
        uint16_t container_port;                /* non-zero */
        LIST_FIELDS(struct ExposePort, ports);  /* list linkage, under the name "ports" */
} ExposePort;
113
/* Outcome of a container run. NOTE(review): the code that produces and
 * consumes these values is not in this part of the file; going by the names,
 * this distinguishes a final exit from a reboot request -- confirm there. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
118
/* Journal linking mode selected with --link-journal= (or -j). The "try-guest"
 * and "try-host" spellings map to LINK_GUEST/LINK_HOST with
 * arg_link_journal_try set in addition. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; also the built-in default */
        LINK_HOST,      /* --link-journal=host / try-host */
        LINK_GUEST      /* --link-journal=guest / try-guest / -j */
} LinkJournal;
125
/* Volatile mode selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO,    /* default, or --volatile=<false boolean> */
        VOLATILE_YES,   /* --volatile without argument, or with a true boolean */
        VOLATILE_STATE, /* --volatile=state */
} Volatile;
131
/* Command line state. Everything below is filled in by parse_argv(); the
 * initializers are the defaults used when the matching option is absent. */

/* -D/--directory= and --template=, both stored via set_sanitized_path(). */
static char *arg_directory = NULL;
static char *arg_template = NULL;
/* -u/--user= (heap copy of the argument). */
static char *arg_user = NULL;
/* --uuid= */
static sd_id128_t arg_uuid = {};
/* -M/--machine= (validated heap copy; an empty argument resets it to NULL). */
static char *arg_machine = NULL;
/* -Z, -L and -S. These point straight at the option argument, they are not copies. */
static const char *arg_selinux_context = NULL;
static const char *arg_selinux_apifs_context = NULL;
static const char *arg_slice = NULL;
/* --private-network; also switched on by every --network-* option and by -n. */
static bool arg_private_network = false;
/* --read-only, -b/--boot, -x/--ephemeral */
static bool arg_read_only = false;
static bool arg_boot = false;
static bool arg_ephemeral = false;
/* --link-journal= / -j: the mode, plus whether a "try-" variant was requested. */
static LinkJournal arg_link_journal = LINK_AUTO;
static bool arg_link_journal_try = false;
/* Default capability bit mask, one bit per CAP_* number. --capability= and
 * --drop-capability= are collected into separate masks by parse_argv().
 * NOTE(review): where those masks are merged into this one is not visible in
 * this part of the file. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
/* --bind= / --bind-ro=: string vectors holding consecutive pairs, first the
 * part before ':' and then the part after it (the same path twice when no
 * ':' was given). Both parts must be absolute paths. */
static char **arg_bind = NULL;
static char **arg_bind_ro = NULL;
/* --tmpfs=: consecutive (path, mount options) pairs; the options default to
 * "mode=0755" when none were given. */
static char **arg_tmpfs = NULL;
/* --setenv=: validated NAME=VALUE assignments. */
static char **arg_setenv = NULL;
/* -q/--quiet */
static bool arg_quiet = false;
/* --share-system; when set, parse_argv() also forces arg_register to false. */
static bool arg_share_system = false;
/* --register=BOOLEAN */
static bool arg_register = true;
/* --keep-unit */
static bool arg_keep_unit = false;
/* --network-interface=, --network-macvlan=, --network-ipvlan=: one entry per use. */
static char **arg_network_interfaces = NULL;
static char **arg_network_macvlan = NULL;
static char **arg_network_ipvlan = NULL;
/* -n/--network-veth; also implied by --network-bridge=. */
static bool arg_network_veth = false;
/* --network-bridge= (points at the option argument). */
static const char *arg_network_bridge = NULL;
/* --personality=. parse_argv() rejects 0xffffffffLU as the "unknown" result of
 * personality_from_string(); presumably the same value doubles as "not set"
 * here -- confirm at the point of use. */
static unsigned long arg_personality = 0xffffffffLU;
/* -i/--image=, stored via set_sanitized_path(). */
static char *arg_image = NULL;
/* --volatile[=MODE] */
static Volatile arg_volatile = VOLATILE_NO;
/* -p/--port=: head of the ExposePort list; new entries are prepended. */
static ExposePort *arg_expose_ports = NULL;
/* --property=: one entry per use, stored as given. */
static char **arg_property = NULL;
191
192 static void help(void) {
193         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
194                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
195                "  -h --help                 Show this help\n"
196                "     --version              Print version string\n"
197                "  -q --quiet                Do not show status information\n"
198                "  -D --directory=PATH       Root directory for the container\n"
199                "     --template=PATH        Initialize root directory from template directory,\n"
200                "                            if missing\n"
201                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
202                "                            remove it after exit\n"
203                "  -i --image=PATH           File system device or disk image for the container\n"
204                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
205                "  -u --user=USER            Run the command under specified user or uid\n"
206                "  -M --machine=NAME         Set the machine name for the container\n"
207                "     --uuid=UUID            Set a specific machine UUID for the container\n"
208                "  -S --slice=SLICE          Place the container in the specified slice\n"
209                "     --property=NAME=VALUE  Set scope unit property\n"
210                "     --private-network      Disable network in container\n"
211                "     --network-interface=INTERFACE\n"
212                "                            Assign an existing network interface to the\n"
213                "                            container\n"
214                "     --network-macvlan=INTERFACE\n"
215                "                            Create a macvlan network interface based on an\n"
216                "                            existing network interface to the container\n"
217                "     --network-ipvlan=INTERFACE\n"
218                "                            Create a ipvlan network interface based on an\n"
219                "                            existing network interface to the container\n"
220                "  -n --network-veth         Add a virtual ethernet connection between host\n"
221                "                            and container\n"
222                "     --network-bridge=INTERFACE\n"
223                "                            Add a virtual ethernet connection between host\n"
224                "                            and container and add it to an existing bridge on\n"
225                "                            the host\n"
226                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
227                "                            Expose a container IP port on the host\n"
228                "  -Z --selinux-context=SECLABEL\n"
229                "                            Set the SELinux security context to be used by\n"
230                "                            processes in the container\n"
231                "  -L --selinux-apifs-context=SECLABEL\n"
232                "                            Set the SELinux security context to be used by\n"
233                "                            API/tmpfs file systems in the container\n"
234                "     --capability=CAP       In addition to the default, retain specified\n"
235                "                            capability\n"
236                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
237                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
238                "                            try-guest, try-host\n"
239                "  -j                        Equivalent to --link-journal=try-guest\n"
240                "     --read-only            Mount the root directory read-only\n"
241                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
242                "                            the container\n"
243                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
244                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
245                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
246                "     --share-system         Share system namespaces with host\n"
247                "     --register=BOOLEAN     Register container as machine\n"
248                "     --keep-unit            Do not register a scope for the machine, reuse\n"
249                "                            the service unit nspawn is running in\n"
250                "     --volatile[=MODE]      Run the system in volatile mode\n"
251                , program_invocation_short_name);
252 }
253
/* Turn `path` into an absolute path and store it in *b, freeing whatever
 * string *b pointed to before.
 *
 * canonicalize_file_name() is tried first. If it fails with ENOENT, the path
 * is made absolute with path_make_absolute_cwd() instead. The string is then
 * passed through path_kill_slashes() and the result is what ends up in *b.
 *
 * Returns 0 on success, -errno if canonicalization fails with anything other
 * than ENOENT, and -ENOMEM if path_make_absolute_cwd() returns NULL. On
 * failure *b is left untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is reported as is. */
        if (resolved == NULL && errno != ENOENT)
                return -errno;

        if (resolved == NULL) {
                resolved = path_make_absolute_cwd(path);
                if (resolved == NULL)
                        return -ENOMEM;
        }

        /* Only now that a replacement exists is the old value released. */
        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
274
275 static int parse_argv(int argc, char *argv[]) {
276
277         enum {
278                 ARG_VERSION = 0x100,
279                 ARG_PRIVATE_NETWORK,
280                 ARG_UUID,
281                 ARG_READ_ONLY,
282                 ARG_CAPABILITY,
283                 ARG_DROP_CAPABILITY,
284                 ARG_LINK_JOURNAL,
285                 ARG_BIND,
286                 ARG_BIND_RO,
287                 ARG_TMPFS,
288                 ARG_SETENV,
289                 ARG_SHARE_SYSTEM,
290                 ARG_REGISTER,
291                 ARG_KEEP_UNIT,
292                 ARG_NETWORK_INTERFACE,
293                 ARG_NETWORK_MACVLAN,
294                 ARG_NETWORK_IPVLAN,
295                 ARG_NETWORK_BRIDGE,
296                 ARG_PERSONALITY,
297                 ARG_VOLATILE,
298                 ARG_TEMPLATE,
299                 ARG_PROPERTY,
300         };
301
302         static const struct option options[] = {
303                 { "help",                  no_argument,       NULL, 'h'                   },
304                 { "version",               no_argument,       NULL, ARG_VERSION           },
305                 { "directory",             required_argument, NULL, 'D'                   },
306                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
307                 { "ephemeral",             no_argument,       NULL, 'x'                   },
308                 { "user",                  required_argument, NULL, 'u'                   },
309                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
310                 { "boot",                  no_argument,       NULL, 'b'                   },
311                 { "uuid",                  required_argument, NULL, ARG_UUID              },
312                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
313                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
314                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
315                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
316                 { "bind",                  required_argument, NULL, ARG_BIND              },
317                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
318                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
319                 { "machine",               required_argument, NULL, 'M'                   },
320                 { "slice",                 required_argument, NULL, 'S'                   },
321                 { "setenv",                required_argument, NULL, ARG_SETENV            },
322                 { "selinux-context",       required_argument, NULL, 'Z'                   },
323                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
324                 { "quiet",                 no_argument,       NULL, 'q'                   },
325                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
326                 { "register",              required_argument, NULL, ARG_REGISTER          },
327                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
328                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
329                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
330                 { "network-ipvlan",        required_argument, NULL, ARG_NETWORK_IPVLAN    },
331                 { "network-veth",          no_argument,       NULL, 'n'                   },
332                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
333                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
334                 { "image",                 required_argument, NULL, 'i'                   },
335                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
336                 { "port",                  required_argument, NULL, 'p'                   },
337                 { "property",              required_argument, NULL, ARG_PROPERTY          },
338                 {}
339         };
340
341         int c, r;
342         uint64_t plus = 0, minus = 0;
343
344         assert(argc >= 0);
345         assert(argv);
346
347         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:xp:n", options, NULL)) >= 0)
348
349                 switch (c) {
350
351                 case 'h':
352                         help();
353                         return 0;
354
355                 case ARG_VERSION:
356                         puts(PACKAGE_STRING);
357                         puts(SYSTEMD_FEATURES);
358                         return 0;
359
360                 case 'D':
361                         r = set_sanitized_path(&arg_directory, optarg);
362                         if (r < 0)
363                                 return log_error_errno(r, "Invalid root directory: %m");
364
365                         break;
366
367                 case ARG_TEMPLATE:
368                         r = set_sanitized_path(&arg_template, optarg);
369                         if (r < 0)
370                                 return log_error_errno(r, "Invalid template directory: %m");
371
372                         break;
373
374                 case 'i':
375                         r = set_sanitized_path(&arg_image, optarg);
376                         if (r < 0)
377                                 return log_error_errno(r, "Invalid image path: %m");
378
379                         break;
380
381                 case 'x':
382                         arg_ephemeral = true;
383                         break;
384
385                 case 'u':
386                         free(arg_user);
387                         arg_user = strdup(optarg);
388                         if (!arg_user)
389                                 return log_oom();
390
391                         break;
392
393                 case ARG_NETWORK_BRIDGE:
394                         arg_network_bridge = optarg;
395
396                         /* fall through */
397
398                 case 'n':
399                         arg_network_veth = true;
400                         arg_private_network = true;
401                         break;
402
403                 case ARG_NETWORK_INTERFACE:
404                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
405                                 return log_oom();
406
407                         arg_private_network = true;
408                         break;
409
410                 case ARG_NETWORK_MACVLAN:
411                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
412                                 return log_oom();
413
414                         arg_private_network = true;
415                         break;
416
417                 case ARG_NETWORK_IPVLAN:
418                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
419                                 return log_oom();
420
421                         /* fall through */
422
423                 case ARG_PRIVATE_NETWORK:
424                         arg_private_network = true;
425                         break;
426
427                 case 'b':
428                         arg_boot = true;
429                         break;
430
431                 case ARG_UUID:
432                         r = sd_id128_from_string(optarg, &arg_uuid);
433                         if (r < 0) {
434                                 log_error("Invalid UUID: %s", optarg);
435                                 return r;
436                         }
437                         break;
438
439                 case 'S':
440                         arg_slice = optarg;
441                         break;
442
443                 case 'M':
444                         if (isempty(optarg)) {
445                                 free(arg_machine);
446                                 arg_machine = NULL;
447                         } else {
448                                 if (!machine_name_is_valid(optarg)) {
449                                         log_error("Invalid machine name: %s", optarg);
450                                         return -EINVAL;
451                                 }
452
453                                 r = free_and_strdup(&arg_machine, optarg);
454                                 if (r < 0)
455                                         return log_oom();
456
457                                 break;
458                         }
459
460                 case 'Z':
461                         arg_selinux_context = optarg;
462                         break;
463
464                 case 'L':
465                         arg_selinux_apifs_context = optarg;
466                         break;
467
468                 case ARG_READ_ONLY:
469                         arg_read_only = true;
470                         break;
471
472                 case ARG_CAPABILITY:
473                 case ARG_DROP_CAPABILITY: {
474                         const char *state, *word;
475                         size_t length;
476
477                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
478                                 _cleanup_free_ char *t;
479
480                                 t = strndup(word, length);
481                                 if (!t)
482                                         return log_oom();
483
484                                 if (streq(t, "all")) {
485                                         if (c == ARG_CAPABILITY)
486                                                 plus = (uint64_t) -1;
487                                         else
488                                                 minus = (uint64_t) -1;
489                                 } else {
490                                         int cap;
491
492                                         cap = capability_from_name(t);
493                                         if (cap < 0) {
494                                                 log_error("Failed to parse capability %s.", t);
495                                                 return -EINVAL;
496                                         }
497
498                                         if (c == ARG_CAPABILITY)
499                                                 plus |= 1ULL << (uint64_t) cap;
500                                         else
501                                                 minus |= 1ULL << (uint64_t) cap;
502                                 }
503                         }
504
505                         break;
506                 }
507
508                 case 'j':
509                         arg_link_journal = LINK_GUEST;
510                         arg_link_journal_try = true;
511                         break;
512
513                 case ARG_LINK_JOURNAL:
514                         if (streq(optarg, "auto")) {
515                                 arg_link_journal = LINK_AUTO;
516                                 arg_link_journal_try = false;
517                         } else if (streq(optarg, "no")) {
518                                 arg_link_journal = LINK_NO;
519                                 arg_link_journal_try = false;
520                         } else if (streq(optarg, "guest")) {
521                                 arg_link_journal = LINK_GUEST;
522                                 arg_link_journal_try = false;
523                         } else if (streq(optarg, "host")) {
524                                 arg_link_journal = LINK_HOST;
525                                 arg_link_journal_try = false;
526                         } else if (streq(optarg, "try-guest")) {
527                                 arg_link_journal = LINK_GUEST;
528                                 arg_link_journal_try = true;
529                         } else if (streq(optarg, "try-host")) {
530                                 arg_link_journal = LINK_HOST;
531                                 arg_link_journal_try = true;
532                         } else {
533                                 log_error("Failed to parse link journal mode %s", optarg);
534                                 return -EINVAL;
535                         }
536
537                         break;
538
539                 case ARG_BIND:
540                 case ARG_BIND_RO: {
541                         _cleanup_free_ char *a = NULL, *b = NULL;
542                         char *e;
543                         char ***x;
544
545                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
546
547                         e = strchr(optarg, ':');
548                         if (e) {
549                                 a = strndup(optarg, e - optarg);
550                                 b = strdup(e + 1);
551                         } else {
552                                 a = strdup(optarg);
553                                 b = strdup(optarg);
554                         }
555
556                         if (!a || !b)
557                                 return log_oom();
558
559                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
560                                 log_error("Invalid bind mount specification: %s", optarg);
561                                 return -EINVAL;
562                         }
563
564                         r = strv_extend(x, a);
565                         if (r < 0)
566                                 return log_oom();
567
568                         r = strv_extend(x, b);
569                         if (r < 0)
570                                 return log_oom();
571
572                         break;
573                 }
574
575                 case ARG_TMPFS: {
576                         _cleanup_free_ char *a = NULL, *b = NULL;
577                         char *e;
578
579                         e = strchr(optarg, ':');
580                         if (e) {
581                                 a = strndup(optarg, e - optarg);
582                                 b = strdup(e + 1);
583                         } else {
584                                 a = strdup(optarg);
585                                 b = strdup("mode=0755");
586                         }
587
588                         if (!a || !b)
589                                 return log_oom();
590
591                         if (!path_is_absolute(a)) {
592                                 log_error("Invalid tmpfs specification: %s", optarg);
593                                 return -EINVAL;
594                         }
595
596                         r = strv_push(&arg_tmpfs, a);
597                         if (r < 0)
598                                 return log_oom();
599
600                         a = NULL;
601
602                         r = strv_push(&arg_tmpfs, b);
603                         if (r < 0)
604                                 return log_oom();
605
606                         b = NULL;
607
608                         break;
609                 }
610
611                 case ARG_SETENV: {
612                         char **n;
613
614                         if (!env_assignment_is_valid(optarg)) {
615                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
616                                 return -EINVAL;
617                         }
618
619                         n = strv_env_set(arg_setenv, optarg);
620                         if (!n)
621                                 return log_oom();
622
623                         strv_free(arg_setenv);
624                         arg_setenv = n;
625                         break;
626                 }
627
628                 case 'q':
629                         arg_quiet = true;
630                         break;
631
632                 case ARG_SHARE_SYSTEM:
633                         arg_share_system = true;
634                         break;
635
636                 case ARG_REGISTER:
637                         r = parse_boolean(optarg);
638                         if (r < 0) {
639                                 log_error("Failed to parse --register= argument: %s", optarg);
640                                 return r;
641                         }
642
643                         arg_register = r;
644                         break;
645
646                 case ARG_KEEP_UNIT:
647                         arg_keep_unit = true;
648                         break;
649
650                 case ARG_PERSONALITY:
651
652                         arg_personality = personality_from_string(optarg);
653                         if (arg_personality == 0xffffffffLU) {
654                                 log_error("Unknown or unsupported personality '%s'.", optarg);
655                                 return -EINVAL;
656                         }
657
658                         break;
659
660                 case ARG_VOLATILE:
661
662                         if (!optarg)
663                                 arg_volatile = VOLATILE_YES;
664                         else {
665                                 r = parse_boolean(optarg);
666                                 if (r < 0) {
667                                         if (streq(optarg, "state"))
668                                                 arg_volatile = VOLATILE_STATE;
669                                         else {
670                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
671                                                 return r;
672                                         }
673                                 } else
674                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
675                         }
676
677                         break;
678
679                 case 'p': {
680                         const char *split, *e;
681                         uint16_t container_port, host_port;
682                         int protocol;
683                         ExposePort *p;
684
685                         if ((e = startswith(optarg, "tcp:")))
686                                 protocol = IPPROTO_TCP;
687                         else if ((e = startswith(optarg, "udp:")))
688                                 protocol = IPPROTO_UDP;
689                         else {
690                                 e = optarg;
691                                 protocol = IPPROTO_TCP;
692                         }
693
694                         split = strchr(e, ':');
695                         if (split) {
696                                 char v[split - e + 1];
697
698                                 memcpy(v, e, split - e);
699                                 v[split - e] = 0;
700
701                                 r = safe_atou16(v, &host_port);
702                                 if (r < 0 || host_port <= 0) {
703                                         log_error("Failed to parse host port: %s", optarg);
704                                         return -EINVAL;
705                                 }
706
707                                 r = safe_atou16(split + 1, &container_port);
708                         } else {
709                                 r = safe_atou16(e, &container_port);
710                                 host_port = container_port;
711                         }
712
713                         if (r < 0 || container_port <= 0) {
714                                 log_error("Failed to parse host port: %s", optarg);
715                                 return -EINVAL;
716                         }
717
718                         LIST_FOREACH(ports, p, arg_expose_ports) {
719                                 if (p->protocol == protocol && p->host_port == host_port) {
720                                         log_error("Duplicate port specification: %s", optarg);
721                                         return -EINVAL;
722                                 }
723                         }
724
725                         p = new(ExposePort, 1);
726                         if (!p)
727                                 return log_oom();
728
729                         p->protocol = protocol;
730                         p->host_port = host_port;
731                         p->container_port = container_port;
732
733                         LIST_PREPEND(ports, arg_expose_ports, p);
734
735                         break;
736                 }
737
738                 case ARG_PROPERTY:
739                         if (strv_extend(&arg_property, optarg) < 0)
740                                 return log_oom();
741
742                         break;
743
744                 case '?':
745                         return -EINVAL;
746
747                 default:
748                         assert_not_reached("Unhandled option");
749                 }
750
751         if (arg_share_system)
752                 arg_register = false;
753
754         if (arg_boot && arg_share_system) {
755                 log_error("--boot and --share-system may not be combined.");
756                 return -EINVAL;
757         }
758
759         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
760                 log_error("--keep-unit may not be used when invoked from a user session.");
761                 return -EINVAL;
762         }
763
764         if (arg_directory && arg_image) {
765                 log_error("--directory= and --image= may not be combined.");
766                 return -EINVAL;
767         }
768
769         if (arg_template && arg_image) {
770                 log_error("--template= and --image= may not be combined.");
771                 return -EINVAL;
772         }
773
774         if (arg_template && !(arg_directory || arg_machine)) {
775                 log_error("--template= needs --directory= or --machine=.");
776                 return -EINVAL;
777         }
778
779         if (arg_ephemeral && arg_template) {
780                 log_error("--ephemeral and --template= may not be combined.");
781                 return -EINVAL;
782         }
783
784         if (arg_ephemeral && arg_image) {
785                 log_error("--ephemeral and --image= may not be combined.");
786                 return -EINVAL;
787         }
788
789         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
790                 log_error("--ephemeral and --link-journal= may not be combined.");
791                 return -EINVAL;
792         }
793
794         if (arg_volatile != VOLATILE_NO && arg_read_only) {
795                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
796                 return -EINVAL;
797         }
798
799         if (arg_expose_ports && !arg_private_network) {
800                 log_error("Cannot use --port= without private networking.");
801                 return -EINVAL;
802         }
803
804         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
805
806         return 1;
807 }
808
809 static int mount_all(const char *dest) {
810
811         typedef struct MountPoint {
812                 const char *what;
813                 const char *where;
814                 const char *type;
815                 const char *options;
816                 unsigned long flags;
817                 bool fatal;
818         } MountPoint;
819
820         static const MountPoint mount_table[] = {
821                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
822                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
823                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
824                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
825                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
826                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
827                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
828                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
829                 { "tmpfs",     "/tmp",      "tmpfs", "mode=1777", MS_STRICTATIME,                         true  },
830 #ifdef HAVE_SELINUX
831                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
832                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
833 #endif
834         };
835
836         unsigned k;
837         int r = 0;
838
839         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
840                 _cleanup_free_ char *where = NULL;
841 #ifdef HAVE_SELINUX
842                 _cleanup_free_ char *options = NULL;
843 #endif
844                 const char *o;
845                 int t;
846
847                 where = strjoin(dest, "/", mount_table[k].where, NULL);
848                 if (!where)
849                         return log_oom();
850
851                 t = path_is_mount_point(where, true);
852                 if (t < 0) {
853                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
854
855                         if (r == 0)
856                                 r = t;
857
858                         continue;
859                 }
860
861                 /* Skip this entry if it is not a remount. */
862                 if (mount_table[k].what && t > 0)
863                         continue;
864
865                 t = mkdir_p(where, 0755);
866                 if (t < 0) {
867                         if (mount_table[k].fatal) {
868                                log_error_errno(t, "Failed to create directory %s: %m", where);
869
870                                 if (r == 0)
871                                         r = t;
872                         } else
873                                log_warning_errno(t, "Failed to create directory %s: %m", where);
874
875                         continue;
876                 }
877
878 #ifdef HAVE_SELINUX
879                 if (arg_selinux_apifs_context &&
880                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
881                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
882                         if (!options)
883                                 return log_oom();
884
885                         o = options;
886                 } else
887 #endif
888                         o = mount_table[k].options;
889
890
891                 if (mount(mount_table[k].what,
892                           where,
893                           mount_table[k].type,
894                           mount_table[k].flags,
895                           o) < 0) {
896
897                         if (mount_table[k].fatal) {
898                                 log_error_errno(errno, "mount(%s) failed: %m", where);
899
900                                 if (r == 0)
901                                         r = -errno;
902                         } else
903                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
904                 }
905         }
906
907         return r;
908 }
909
910 static int mount_binds(const char *dest, char **l, bool ro) {
911         char **x, **y;
912
913         STRV_FOREACH_PAIR(x, y, l) {
914                 _cleanup_free_ char *where = NULL;
915                 struct stat source_st, dest_st;
916                 int r;
917
918                 if (stat(*x, &source_st) < 0)
919                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
920
921                 where = strappend(dest, *y);
922                 if (!where)
923                         return log_oom();
924
925                 r = stat(where, &dest_st);
926                 if (r == 0) {
927                         if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
928                                 log_error("Cannot bind mount directory %s on file %s.", *x, where);
929                                 return -EINVAL;
930                         }
931                         if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
932                                 log_error("Cannot bind mount file %s on directory %s.", *x, where);
933                                 return -EINVAL;
934                         }
935                 } else if (errno == ENOENT) {
936                         r = mkdir_parents_label(where, 0755);
937                         if (r < 0)
938                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
939                 } else {
940                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
941                         return -errno;
942                 }
943
944                 /* Create the mount point. Any non-directory file can be
945                  * mounted on any non-directory file (regular, fifo, socket,
946                  * char, block).
947                  */
948                 if (S_ISDIR(source_st.st_mode)) {
949                         r = mkdir_label(where, 0755);
950                         if (r < 0 && errno != EEXIST)
951                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
952                 } else {
953                         r = touch(where);
954                         if (r < 0)
955                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
956                 }
957
958                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
959                         return log_error_errno(errno, "mount(%s) failed: %m", where);
960
961                 if (ro) {
962                         r = bind_remount_recursive(where, true);
963                         if (r < 0)
964                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
965                 }
966         }
967
968         return 0;
969 }
970
971 static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
972         char *to;
973         int r;
974
975         to = strjoina(dest, "/sys/fs/cgroup/", hierarchy);
976
977         r = path_is_mount_point(to, false);
978         if (r < 0)
979                 return log_error_errno(r, "Failed to determine if %s is mounted already: %m", to);
980         if (r > 0)
981                 return 0;
982
983         mkdir_p(to, 0755);
984
985         /* The superblock mount options of the mount point need to be
986          * identical to the hosts', and hence writable... */
987         if (mount("cgroup", to, "cgroup", MS_NOSUID|MS_NOEXEC|MS_NODEV, controller) < 0)
988                 return log_error_errno(errno, "Failed to mount to %s: %m", to);
989
990         /* ... hence let's only make the bind mount read-only, not the
991          * superblock. */
992         if (read_only) {
993                 if (mount(NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
994                         return log_error_errno(errno, "Failed to remount %s read-only: %m", to);
995         }
996         return 1;
997 }
998
999 static int mount_cgroup(const char *dest) {
1000         _cleanup_set_free_free_ Set *controllers = NULL;
1001         _cleanup_free_ char *own_cgroup_path = NULL;
1002         const char *cgroup_root, *systemd_root, *systemd_own;
1003         int r;
1004
1005         controllers = set_new(&string_hash_ops);
1006         if (!controllers)
1007                 return log_oom();
1008
1009         r = cg_kernel_controllers(controllers);
1010         if (r < 0)
1011                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
1012
1013         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
1014         if (r < 0)
1015                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
1016
1017         cgroup_root = strjoina(dest, "/sys/fs/cgroup");
1018         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
1019                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
1020
1021         for (;;) {
1022                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
1023
1024                 controller = set_steal_first(controllers);
1025                 if (!controller)
1026                         break;
1027
1028                 origin = strappend("/sys/fs/cgroup/", controller);
1029                 if (!origin)
1030                         return log_oom();
1031
1032                 r = readlink_malloc(origin, &combined);
1033                 if (r == -EINVAL) {
1034                         /* Not a symbolic link, but directly a single cgroup hierarchy */
1035
1036                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
1037                         if (r < 0)
1038                                 return r;
1039
1040                 } else if (r < 0)
1041                         return log_error_errno(r, "Failed to read link %s: %m", origin);
1042                 else {
1043                         _cleanup_free_ char *target = NULL;
1044
1045                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
1046                         if (!target)
1047                                 return log_oom();
1048
1049                         /* A symbolic link, a combination of controllers in one hierarchy */
1050
1051                         if (!filename_is_valid(combined)) {
1052                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
1053                                 continue;
1054                         }
1055
1056                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
1057                         if (r < 0)
1058                                 return r;
1059
1060                         if (symlink(combined, target) < 0)
1061                                 return log_error_errno(errno, "Failed to create symlink for combined hierarchy: %m");
1062                 }
1063         }
1064
1065         r = mount_cgroup_hierarchy(dest, "name=systemd,xattr", "systemd", false);
1066         if (r < 0)
1067                 return r;
1068
1069         /* Make our own cgroup a (writable) bind mount */
1070         systemd_own = strjoina(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
1071         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
1072                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
1073
1074         /* And then remount the systemd cgroup root read-only */
1075         systemd_root = strjoina(dest, "/sys/fs/cgroup/systemd");
1076         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
1077                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
1078
1079         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
1080                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
1081
1082         return 0;
1083 }
1084
1085 static int mount_tmpfs(const char *dest) {
1086         char **i, **o;
1087
1088         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
1089                 _cleanup_free_ char *where = NULL;
1090                 int r;
1091
1092                 where = strappend(dest, *i);
1093                 if (!where)
1094                         return log_oom();
1095
1096                 r = mkdir_label(where, 0755);
1097                 if (r < 0 && r != -EEXIST)
1098                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
1099
1100                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
1101                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
1102         }
1103
1104         return 0;
1105 }
1106
1107 static int setup_timezone(const char *dest) {
1108         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1109         char *z, *y;
1110         int r;
1111
1112         assert(dest);
1113
1114         /* Fix the timezone, if possible */
1115         r = readlink_malloc("/etc/localtime", &p);
1116         if (r < 0) {
1117                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1118                 return 0;
1119         }
1120
1121         z = path_startswith(p, "../usr/share/zoneinfo/");
1122         if (!z)
1123                 z = path_startswith(p, "/usr/share/zoneinfo/");
1124         if (!z) {
1125                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1126                 return 0;
1127         }
1128
1129         where = strappend(dest, "/etc/localtime");
1130         if (!where)
1131                 return log_oom();
1132
1133         r = readlink_malloc(where, &q);
1134         if (r >= 0) {
1135                 y = path_startswith(q, "../usr/share/zoneinfo/");
1136                 if (!y)
1137                         y = path_startswith(q, "/usr/share/zoneinfo/");
1138
1139                 /* Already pointing to the right place? Then do nothing .. */
1140                 if (y && streq(y, z))
1141                         return 0;
1142         }
1143
1144         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1145         if (!check)
1146                 return log_oom();
1147
1148         if (access(check, F_OK) < 0) {
1149                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1150                 return 0;
1151         }
1152
1153         what = strappend("../usr/share/zoneinfo/", z);
1154         if (!what)
1155                 return log_oom();
1156
1157         r = mkdir_parents(where, 0755);
1158         if (r < 0) {
1159                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1160
1161                 return 0;
1162         }
1163
1164         r = unlink(where);
1165         if (r < 0 && errno != ENOENT) {
1166                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1167
1168                 return 0;
1169         }
1170
1171         if (symlink(what, where) < 0) {
1172                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1173                 return 0;
1174         }
1175
1176         return 0;
1177 }
1178
1179 static int setup_resolv_conf(const char *dest) {
1180         _cleanup_free_ char *where = NULL;
1181         int r;
1182
1183         assert(dest);
1184
1185         if (arg_private_network)
1186                 return 0;
1187
1188         /* Fix resolv.conf, if possible */
1189         where = strappend(dest, "/etc/resolv.conf");
1190         if (!where)
1191                 return log_oom();
1192
1193         /* We don't really care for the results of this really. If it
1194          * fails, it fails, but meh... */
1195         r = mkdir_parents(where, 0755);
1196         if (r < 0) {
1197                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1198
1199                 return 0;
1200         }
1201
1202         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644, 0);
1203         if (r < 0) {
1204                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1205
1206                 return 0;
1207         }
1208
1209         return 0;
1210 }
1211
1212 static int setup_volatile_state(const char *directory) {
1213         const char *p;
1214         int r;
1215
1216         assert(directory);
1217
1218         if (arg_volatile != VOLATILE_STATE)
1219                 return 0;
1220
1221         /* --volatile=state means we simply overmount /var
1222            with a tmpfs, and the rest read-only. */
1223
1224         r = bind_remount_recursive(directory, true);
1225         if (r < 0)
1226                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1227
1228         p = strjoina(directory, "/var");
1229         r = mkdir(p, 0755);
1230         if (r < 0 && errno != EEXIST)
1231                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1232
1233         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1234                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1235
1236         return 0;
1237 }
1238
1239 static int setup_volatile(const char *directory) {
1240         bool tmpfs_mounted = false, bind_mounted = false;
1241         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1242         const char *f, *t;
1243         int r;
1244
1245         assert(directory);
1246
1247         if (arg_volatile != VOLATILE_YES)
1248                 return 0;
1249
1250         /* --volatile=yes means we mount a tmpfs to the root dir, and
1251            the original /usr to use inside it, and that read-only. */
1252
1253         if (!mkdtemp(template))
1254                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1255
1256         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1257                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1258                 r = -errno;
1259                 goto fail;
1260         }
1261
1262         tmpfs_mounted = true;
1263
1264         f = strjoina(directory, "/usr");
1265         t = strjoina(template, "/usr");
1266
1267         r = mkdir(t, 0755);
1268         if (r < 0 && errno != EEXIST) {
1269                 log_error_errno(errno, "Failed to create %s: %m", t);
1270                 r = -errno;
1271                 goto fail;
1272         }
1273
1274         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1275                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1276                 r = -errno;
1277                 goto fail;
1278         }
1279
1280         bind_mounted = true;
1281
1282         r = bind_remount_recursive(t, true);
1283         if (r < 0) {
1284                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1285                 goto fail;
1286         }
1287
1288         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1289                 log_error_errno(errno, "Failed to move root mount: %m");
1290                 r = -errno;
1291                 goto fail;
1292         }
1293
1294         rmdir(template);
1295
1296         return 0;
1297
1298 fail:
1299         if (bind_mounted)
1300                 umount(t);
1301         if (tmpfs_mounted)
1302                 umount(template);
1303         rmdir(template);
1304         return r;
1305 }
1306
1307 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1308
1309         snprintf(s, 37,
1310                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1311                  SD_ID128_FORMAT_VAL(id));
1312
1313         return s;
1314 }
1315
1316 static int setup_boot_id(const char *dest) {
1317         _cleanup_free_ char *from = NULL, *to = NULL;
1318         sd_id128_t rnd = {};
1319         char as_uuid[37];
1320         int r;
1321
1322         assert(dest);
1323
1324         if (arg_share_system)
1325                 return 0;
1326
1327         /* Generate a new randomized boot ID, so that each boot-up of
1328          * the container gets a new one */
1329
1330         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1331         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1332         if (!from || !to)
1333                 return log_oom();
1334
1335         r = sd_id128_randomize(&rnd);
1336         if (r < 0)
1337                 return log_error_errno(r, "Failed to generate random boot id: %m");
1338
1339         id128_format_as_uuid(rnd, as_uuid);
1340
1341         r = write_string_file(from, as_uuid);
1342         if (r < 0)
1343                 return log_error_errno(r, "Failed to write boot id: %m");
1344
1345         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1346                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1347                 r = -errno;
1348         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1349                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1350
1351         unlink(from);
1352         return r;
1353 }
1354
1355 static int copy_devnodes(const char *dest) {
1356
1357         static const char devnodes[] =
1358                 "null\0"
1359                 "zero\0"
1360                 "full\0"
1361                 "random\0"
1362                 "urandom\0"
1363                 "tty\0"
1364                 "net/tun\0";
1365
1366         const char *d;
1367         int r = 0;
1368         _cleanup_umask_ mode_t u;
1369
1370         assert(dest);
1371
1372         u = umask(0000);
1373
1374         NULSTR_FOREACH(d, devnodes) {
1375                 _cleanup_free_ char *from = NULL, *to = NULL;
1376                 struct stat st;
1377
1378                 from = strappend("/dev/", d);
1379                 to = strjoin(dest, "/dev/", d, NULL);
1380                 if (!from || !to)
1381                         return log_oom();
1382
1383                 if (stat(from, &st) < 0) {
1384
1385                         if (errno != ENOENT)
1386                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1387
1388                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1389
1390                         log_error("%s is not a char or block device, cannot copy", from);
1391                         return -EIO;
1392
1393                 } else {
1394                         r = mkdir_parents(to, 0775);
1395                         if (r < 0) {
1396                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1397                                 return -r;
1398                         }
1399
1400                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1401                                 return log_error_errno(errno, "mknod(%s) failed: %m", to);
1402                 }
1403         }
1404
1405         return r;
1406 }
1407
1408 static int setup_ptmx(const char *dest) {
1409         _cleanup_free_ char *p = NULL;
1410
1411         p = strappend(dest, "/dev/ptmx");
1412         if (!p)
1413                 return log_oom();
1414
1415         if (symlink("pts/ptmx", p) < 0)
1416                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1417
1418         return 0;
1419 }
1420
1421 static int setup_dev_console(const char *dest, const char *console) {
1422         _cleanup_umask_ mode_t u;
1423         const char *to;
1424         struct stat st;
1425         int r;
1426
1427         assert(dest);
1428         assert(console);
1429
1430         u = umask(0000);
1431
1432         if (stat("/dev/null", &st) < 0)
1433                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1434
1435         r = chmod_and_chown(console, 0600, 0, 0);
1436         if (r < 0)
1437                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1438
1439         /* We need to bind mount the right tty to /dev/console since
1440          * ptys can only exist on pts file systems. To have something
1441          * to bind mount things on we create a device node first, and
1442          * use /dev/null for that since we the cgroups device policy
1443          * allows us to create that freely, while we cannot create
1444          * /dev/console. (Note that the major minor doesn't actually
1445          * matter here, since we mount it over anyway). */
1446
1447         to = strjoina(dest, "/dev/console");
1448         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1449                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1450
1451         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1452                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1453
1454         return 0;
1455 }
1456
1457 static int setup_kmsg(const char *dest, int kmsg_socket) {
1458         _cleanup_free_ char *from = NULL, *to = NULL;
1459         _cleanup_umask_ mode_t u;
1460         int r, fd, k;
1461         union {
1462                 struct cmsghdr cmsghdr;
1463                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1464         } control = {};
1465         struct msghdr mh = {
1466                 .msg_control = &control,
1467                 .msg_controllen = sizeof(control),
1468         };
1469         struct cmsghdr *cmsg;
1470
1471         assert(dest);
1472         assert(kmsg_socket >= 0);
1473
1474         u = umask(0000);
1475
1476         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1477          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1478          * on the reading side behave very similar to /proc/kmsg,
1479          * their writing side behaves differently from /dev/kmsg in
1480          * that writing blocks when nothing is reading. In order to
1481          * avoid any problems with containers deadlocking due to this
1482          * we simply make /dev/kmsg unavailable to the container. */
1483         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1484             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1485                 return log_oom();
1486
1487         if (mkfifo(from, 0600) < 0)
1488                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1489
1490         r = chmod_and_chown(from, 0600, 0, 0);
1491         if (r < 0)
1492                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1493
1494         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1495                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1496
1497         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1498         if (fd < 0)
1499                 return log_error_errno(errno, "Failed to open fifo: %m");
1500
1501         cmsg = CMSG_FIRSTHDR(&mh);
1502         cmsg->cmsg_level = SOL_SOCKET;
1503         cmsg->cmsg_type = SCM_RIGHTS;
1504         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1505         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1506
1507         mh.msg_controllen = cmsg->cmsg_len;
1508
1509         /* Store away the fd in the socket, so that it stays open as
1510          * long as we run the child */
1511         k = sendmsg(kmsg_socket, &mh, MSG_NOSIGNAL);
1512         safe_close(fd);
1513
1514         if (k < 0)
1515                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1516
1517         /* And now make the FIFO unavailable as /dev/kmsg... */
1518         unlink(from);
1519         return 0;
1520 }
1521
1522 static int send_rtnl(int send_fd) {
1523         union {
1524                 struct cmsghdr cmsghdr;
1525                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1526         } control = {};
1527         struct msghdr mh = {
1528                 .msg_control = &control,
1529                 .msg_controllen = sizeof(control),
1530         };
1531         struct cmsghdr *cmsg;
1532         _cleanup_close_ int fd = -1;
1533         ssize_t k;
1534
1535         assert(send_fd >= 0);
1536
1537         if (!arg_expose_ports)
1538                 return 0;
1539
1540         fd = socket(PF_NETLINK, SOCK_RAW|SOCK_CLOEXEC|SOCK_NONBLOCK, NETLINK_ROUTE);
1541         if (fd < 0)
1542                 return log_error_errno(errno, "failed to allocate container netlink: %m");
1543
1544         cmsg = CMSG_FIRSTHDR(&mh);
1545         cmsg->cmsg_level = SOL_SOCKET;
1546         cmsg->cmsg_type = SCM_RIGHTS;
1547         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1548         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1549
1550         mh.msg_controllen = cmsg->cmsg_len;
1551
1552         /* Store away the fd in the socket, so that it stays open as
1553          * long as we run the child */
1554         k = sendmsg(send_fd, &mh, MSG_NOSIGNAL);
1555         if (k < 0)
1556                 return log_error_errno(errno, "Failed to send netlink fd: %m");
1557
1558         return 0;
1559 }
1560
1561 static int flush_ports(union in_addr_union *exposed) {
1562         ExposePort *p;
1563         int r, af = AF_INET;
1564
1565         assert(exposed);
1566
1567         if (!arg_expose_ports)
1568                 return 0;
1569
1570         if (in_addr_is_null(af, exposed))
1571                 return 0;
1572
1573         log_debug("Lost IP address.");
1574
1575         LIST_FOREACH(ports, p, arg_expose_ports) {
1576                 r = fw_add_local_dnat(false,
1577                                       af,
1578                                       p->protocol,
1579                                       NULL,
1580                                       NULL, 0,
1581                                       NULL, 0,
1582                                       p->host_port,
1583                                       exposed,
1584                                       p->container_port,
1585                                       NULL);
1586                 if (r < 0)
1587                         log_warning_errno(r, "Failed to modify firewall: %m");
1588         }
1589
1590         *exposed = IN_ADDR_NULL;
1591         return 0;
1592 }
1593
1594 static int expose_ports(sd_rtnl *rtnl, union in_addr_union *exposed) {
1595         _cleanup_free_ struct local_address *addresses = NULL;
1596         _cleanup_free_ char *pretty = NULL;
1597         union in_addr_union new_exposed;
1598         ExposePort *p;
1599         bool add;
1600         int af = AF_INET, r;
1601
1602         assert(exposed);
1603
1604         /* Invoked each time an address is added or removed inside the
1605          * container */
1606
1607         if (!arg_expose_ports)
1608                 return 0;
1609
1610         r = local_addresses(rtnl, 0, af, &addresses);
1611         if (r < 0)
1612                 return log_error_errno(r, "Failed to enumerate local addresses: %m");
1613
1614         add = r > 0 &&
1615                 addresses[0].family == af &&
1616                 addresses[0].scope < RT_SCOPE_LINK;
1617
1618         if (!add)
1619                 return flush_ports(exposed);
1620
1621         new_exposed = addresses[0].address;
1622         if (in_addr_equal(af, exposed, &new_exposed))
1623                 return 0;
1624
1625         in_addr_to_string(af, &new_exposed, &pretty);
1626         log_debug("New container IP is %s.", strna(pretty));
1627
1628         LIST_FOREACH(ports, p, arg_expose_ports) {
1629
1630                 r = fw_add_local_dnat(true,
1631                                       af,
1632                                       p->protocol,
1633                                       NULL,
1634                                       NULL, 0,
1635                                       NULL, 0,
1636                                       p->host_port,
1637                                       &new_exposed,
1638                                       p->container_port,
1639                                       in_addr_is_null(af, exposed) ? NULL : exposed);
1640                 if (r < 0)
1641                         log_warning_errno(r, "Failed to modify firewall: %m");
1642         }
1643
1644         *exposed = new_exposed;
1645         return 0;
1646 }
1647
1648 static int on_address_change(sd_rtnl *rtnl, sd_rtnl_message *m, void *userdata) {
1649         union in_addr_union *exposed = userdata;
1650
1651         assert(rtnl);
1652         assert(m);
1653         assert(exposed);
1654
1655         expose_ports(rtnl, exposed);
1656         return 0;
1657 }
1658
1659 static int watch_rtnl(sd_event *event, int recv_fd, union in_addr_union *exposed, sd_rtnl **ret) {
1660         union {
1661                 struct cmsghdr cmsghdr;
1662                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1663         } control = {};
1664         struct msghdr mh = {
1665                 .msg_control = &control,
1666                 .msg_controllen = sizeof(control),
1667         };
1668         struct cmsghdr *cmsg;
1669         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1670         int fd, r;
1671         ssize_t k;
1672
1673         assert(event);
1674         assert(recv_fd >= 0);
1675         assert(ret);
1676
1677         if (!arg_expose_ports)
1678                 return 0;
1679
1680         k = recvmsg(recv_fd, &mh, MSG_NOSIGNAL);
1681         if (k < 0)
1682                 return log_error_errno(errno, "Failed to recv netlink fd: %m");
1683
1684         cmsg = CMSG_FIRSTHDR(&mh);
1685         assert(cmsg->cmsg_level == SOL_SOCKET);
1686         assert(cmsg->cmsg_type == SCM_RIGHTS);
1687         assert(cmsg->cmsg_len == CMSG_LEN(sizeof(int)));
1688         memcpy(&fd, CMSG_DATA(cmsg), sizeof(int));
1689
1690         r = sd_rtnl_open_fd(&rtnl, fd, 1, RTNLGRP_IPV4_IFADDR);
1691         if (r < 0) {
1692                 safe_close(fd);
1693                 return log_error_errno(r, "Failed to create rtnl object: %m");
1694         }
1695
1696         r = sd_rtnl_add_match(rtnl, RTM_NEWADDR, on_address_change, exposed);
1697         if (r < 0)
1698                 return log_error_errno(r, "Failed to subscribe to RTM_NEWADDR messages: %m");
1699
1700         r = sd_rtnl_add_match(rtnl, RTM_DELADDR, on_address_change, exposed);
1701         if (r < 0)
1702                 return log_error_errno(r, "Failed to subscribe to RTM_DELADDR messages: %m");
1703
1704         r = sd_rtnl_attach_event(rtnl, event, 0);
1705         if (r < 0)
1706                 return log_error_errno(r, "Failed to add to even loop: %m");
1707
1708         *ret = rtnl;
1709         rtnl = NULL;
1710
1711         return 0;
1712 }
1713
1714 static int setup_hostname(void) {
1715
1716         if (arg_share_system)
1717                 return 0;
1718
1719         if (sethostname_idempotent(arg_machine) < 0)
1720                 return -errno;
1721
1722         return 0;
1723 }
1724
1725 static int setup_journal(const char *directory) {
1726         sd_id128_t machine_id, this_id;
1727         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1728         char *id;
1729         int r;
1730
1731         /* Don't link journals in ephemeral mode */
1732         if (arg_ephemeral)
1733                 return 0;
1734
1735         p = strappend(directory, "/etc/machine-id");
1736         if (!p)
1737                 return log_oom();
1738
1739         r = read_one_line_file(p, &b);
1740         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1741                 return 0;
1742         else if (r < 0)
1743                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1744
1745         id = strstrip(b);
1746         if (isempty(id) && arg_link_journal == LINK_AUTO)
1747                 return 0;
1748
1749         /* Verify validity */
1750         r = sd_id128_from_string(id, &machine_id);
1751         if (r < 0)
1752                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1753
1754         r = sd_id128_get_machine(&this_id);
1755         if (r < 0)
1756                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1757
1758         if (sd_id128_equal(machine_id, this_id)) {
1759                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1760                          "Host and machine ids are equal (%s): refusing to link journals", id);
1761                 if (arg_link_journal == LINK_AUTO)
1762                         return 0;
1763                 return -EEXIST;
1764         }
1765
1766         if (arg_link_journal == LINK_NO)
1767                 return 0;
1768
1769         free(p);
1770         p = strappend("/var/log/journal/", id);
1771         q = strjoin(directory, "/var/log/journal/", id, NULL);
1772         if (!p || !q)
1773                 return log_oom();
1774
1775         if (path_is_mount_point(p, false) > 0) {
1776                 if (arg_link_journal != LINK_AUTO) {
1777                         log_error("%s: already a mount point, refusing to use for journal", p);
1778                         return -EEXIST;
1779                 }
1780
1781                 return 0;
1782         }
1783
1784         if (path_is_mount_point(q, false) > 0) {
1785                 if (arg_link_journal != LINK_AUTO) {
1786                         log_error("%s: already a mount point, refusing to use for journal", q);
1787                         return -EEXIST;
1788                 }
1789
1790                 return 0;
1791         }
1792
1793         r = readlink_and_make_absolute(p, &d);
1794         if (r >= 0) {
1795                 if ((arg_link_journal == LINK_GUEST ||
1796                      arg_link_journal == LINK_AUTO) &&
1797                     path_equal(d, q)) {
1798
1799                         r = mkdir_p(q, 0755);
1800                         if (r < 0)
1801                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1802                         return 0;
1803                 }
1804
1805                 if (unlink(p) < 0)
1806                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1807         } else if (r == -EINVAL) {
1808
1809                 if (arg_link_journal == LINK_GUEST &&
1810                     rmdir(p) < 0) {
1811
1812                         if (errno == ENOTDIR) {
1813                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1814                                 return r;
1815                         } else {
1816                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1817                                 return -errno;
1818                         }
1819                 }
1820         } else if (r != -ENOENT) {
1821                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1822                 return r;
1823         }
1824
1825         if (arg_link_journal == LINK_GUEST) {
1826
1827                 if (symlink(q, p) < 0) {
1828                         if (arg_link_journal_try) {
1829                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1830                                 return 0;
1831                         } else {
1832                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1833                                 return -errno;
1834                         }
1835                 }
1836
1837                 r = mkdir_p(q, 0755);
1838                 if (r < 0)
1839                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1840                 return 0;
1841         }
1842
1843         if (arg_link_journal == LINK_HOST) {
1844                 /* don't create parents here -- if the host doesn't have
1845                  * permanent journal set up, don't force it here */
1846                 r = mkdir(p, 0755);
1847                 if (r < 0) {
1848                         if (arg_link_journal_try) {
1849                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1850                                 return 0;
1851                         } else {
1852                                 log_error_errno(errno, "Failed to create %s: %m", p);
1853                                 return r;
1854                         }
1855                 }
1856
1857         } else if (access(p, F_OK) < 0)
1858                 return 0;
1859
1860         if (dir_is_empty(q) == 0)
1861                 log_warning("%s is not empty, proceeding anyway.", q);
1862
1863         r = mkdir_p(q, 0755);
1864         if (r < 0) {
1865                 log_error_errno(errno, "Failed to create %s: %m", q);
1866                 return r;
1867         }
1868
1869         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1870                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1871
1872         return 0;
1873 }
1874
1875 static int drop_capabilities(void) {
1876         return capability_bounding_set_drop(~arg_retain, false);
1877 }
1878
1879 static int register_machine(pid_t pid, int local_ifindex) {
1880         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1881         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1882         int r;
1883
1884         if (!arg_register)
1885                 return 0;
1886
1887         r = sd_bus_default_system(&bus);
1888         if (r < 0)
1889                 return log_error_errno(r, "Failed to open system bus: %m");
1890
1891         if (arg_keep_unit) {
1892                 r = sd_bus_call_method(
1893                                 bus,
1894                                 "org.freedesktop.machine1",
1895                                 "/org/freedesktop/machine1",
1896                                 "org.freedesktop.machine1.Manager",
1897                                 "RegisterMachineWithNetwork",
1898                                 &error,
1899                                 NULL,
1900                                 "sayssusai",
1901                                 arg_machine,
1902                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1903                                 "nspawn",
1904                                 "container",
1905                                 (uint32_t) pid,
1906                                 strempty(arg_directory),
1907                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1908         } else {
1909                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1910                 char **i;
1911
1912                 r = sd_bus_message_new_method_call(
1913                                 bus,
1914                                 &m,
1915                                 "org.freedesktop.machine1",
1916                                 "/org/freedesktop/machine1",
1917                                 "org.freedesktop.machine1.Manager",
1918                                 "CreateMachineWithNetwork");
1919                 if (r < 0)
1920                         return bus_log_create_error(r);
1921
1922                 r = sd_bus_message_append(
1923                                 m,
1924                                 "sayssusai",
1925                                 arg_machine,
1926                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1927                                 "nspawn",
1928                                 "container",
1929                                 (uint32_t) pid,
1930                                 strempty(arg_directory),
1931                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1932                 if (r < 0)
1933                         return bus_log_create_error(r);
1934
1935                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1936                 if (r < 0)
1937                         return bus_log_create_error(r);
1938
1939                 if (!isempty(arg_slice)) {
1940                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1941                         if (r < 0)
1942                                 return bus_log_create_error(r);
1943                 }
1944
1945                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1946                 if (r < 0)
1947                         return bus_log_create_error(r);
1948
1949                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1950                                           /* Allow the container to
1951                                            * access and create the API
1952                                            * device nodes, so that
1953                                            * PrivateDevices= in the
1954                                            * container can work
1955                                            * fine */
1956                                           "/dev/null", "rwm",
1957                                           "/dev/zero", "rwm",
1958                                           "/dev/full", "rwm",
1959                                           "/dev/random", "rwm",
1960                                           "/dev/urandom", "rwm",
1961                                           "/dev/tty", "rwm",
1962                                           "/dev/net/tun", "rwm",
1963                                           /* Allow the container
1964                                            * access to ptys. However,
1965                                            * do not permit the
1966                                            * container to ever create
1967                                            * these device nodes. */
1968                                           "/dev/pts/ptmx", "rw",
1969                                           "char-pts", "rw");
1970                 if (r < 0)
1971                         return log_error_errno(r, "Failed to add device whitelist: %m");
1972
1973                 STRV_FOREACH(i, arg_property) {
1974                         r = sd_bus_message_open_container(m, 'r', "sv");
1975                         if (r < 0)
1976                                 return bus_log_create_error(r);
1977
1978                         r = bus_append_unit_property_assignment(m, *i);
1979                         if (r < 0)
1980                                 return r;
1981
1982                         r = sd_bus_message_close_container(m);
1983                         if (r < 0)
1984                                 return bus_log_create_error(r);
1985                 }
1986
1987                 r = sd_bus_message_close_container(m);
1988                 if (r < 0)
1989                         return bus_log_create_error(r);
1990
1991                 r = sd_bus_call(bus, m, 0, &error, NULL);
1992         }
1993
1994         if (r < 0) {
1995                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1996                 return r;
1997         }
1998
1999         return 0;
2000 }
2001
2002 static int terminate_machine(pid_t pid) {
2003         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
2004         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
2005         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
2006         const char *path;
2007         int r;
2008
2009         if (!arg_register)
2010                 return 0;
2011
2012         r = sd_bus_default_system(&bus);
2013         if (r < 0)
2014                 return log_error_errno(r, "Failed to open system bus: %m");
2015
2016         r = sd_bus_call_method(
2017                         bus,
2018                         "org.freedesktop.machine1",
2019                         "/org/freedesktop/machine1",
2020                         "org.freedesktop.machine1.Manager",
2021                         "GetMachineByPID",
2022                         &error,
2023                         &reply,
2024                         "u",
2025                         (uint32_t) pid);
2026         if (r < 0) {
2027                 /* Note that the machine might already have been
2028                  * cleaned up automatically, hence don't consider it a
2029                  * failure if we cannot get the machine object. */
2030                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
2031                 return 0;
2032         }
2033
2034         r = sd_bus_message_read(reply, "o", &path);
2035         if (r < 0)
2036                 return bus_log_parse_error(r);
2037
2038         r = sd_bus_call_method(
2039                         bus,
2040                         "org.freedesktop.machine1",
2041                         path,
2042                         "org.freedesktop.machine1.Machine",
2043                         "Terminate",
2044                         &error,
2045                         NULL,
2046                         NULL);
2047         if (r < 0) {
2048                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
2049                 return 0;
2050         }
2051
2052         return 0;
2053 }
2054
2055 static int reset_audit_loginuid(void) {
2056         _cleanup_free_ char *p = NULL;
2057         int r;
2058
2059         if (arg_share_system)
2060                 return 0;
2061
2062         r = read_one_line_file("/proc/self/loginuid", &p);
2063         if (r == -ENOENT)
2064                 return 0;
2065         if (r < 0)
2066                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2067
2068         /* Already reset? */
2069         if (streq(p, "4294967295"))
2070                 return 0;
2071
2072         r = write_string_file("/proc/self/loginuid", "4294967295");
2073         if (r < 0) {
2074                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
2075                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2076                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2077                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2078                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
2079
2080                 sleep(5);
2081         }
2082
2083         return 0;
2084 }
2085
2086 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
2087 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
2088 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
2089
2090 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
2091         uint8_t result[8];
2092         size_t l, sz;
2093         uint8_t *v, *i;
2094         int r;
2095
2096         l = strlen(arg_machine);
2097         sz = sizeof(sd_id128_t) + l;
2098         if (idx > 0)
2099                 sz += sizeof(idx);
2100
2101         v = alloca(sz);
2102
2103         /* fetch some persistent data unique to the host */
2104         r = sd_id128_get_machine((sd_id128_t*) v);
2105         if (r < 0)
2106                 return r;
2107
2108         /* combine with some data unique (on this host) to this
2109          * container instance */
2110         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
2111         if (idx > 0) {
2112                 idx = htole64(idx);
2113                 memcpy(i, &idx, sizeof(idx));
2114         }
2115
2116         /* Let's hash the host machine ID plus the container name. We
2117          * use a fixed, but originally randomly created hash key here. */
2118         siphash24(result, v, sz, hash_key.bytes);
2119
2120         assert_cc(ETH_ALEN <= sizeof(result));
2121         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
2122
2123         /* see eth_random_addr in the kernel */
2124         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
2125         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
2126
2127         return 0;
2128 }
2129
2130 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
2131         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2132         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2133         struct ether_addr mac_host, mac_container;
2134         int r, i;
2135
2136         if (!arg_private_network)
2137                 return 0;
2138
2139         if (!arg_network_veth)
2140                 return 0;
2141
2142         /* Use two different interface name prefixes depending whether
2143          * we are in bridge mode or not. */
2144         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
2145                  arg_network_bridge ? "vb" : "ve", arg_machine);
2146
2147         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
2148         if (r < 0)
2149                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
2150
2151         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
2152         if (r < 0)
2153                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
2154
2155         r = sd_rtnl_open(&rtnl, 0);
2156         if (r < 0)
2157                 return log_error_errno(r, "Failed to connect to netlink: %m");
2158
2159         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2160         if (r < 0)
2161                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2162
2163         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
2164         if (r < 0)
2165                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2166
2167         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
2168         if (r < 0)
2169                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2170
2171         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2172         if (r < 0)
2173                 return log_error_errno(r, "Failed to open netlink container: %m");
2174
2175         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
2176         if (r < 0)
2177                 return log_error_errno(r, "Failed to open netlink container: %m");
2178
2179         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
2180         if (r < 0)
2181                 return log_error_errno(r, "Failed to open netlink container: %m");
2182
2183         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
2184         if (r < 0)
2185                 return log_error_errno(r, "Failed to add netlink interface name: %m");
2186
2187         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
2188         if (r < 0)
2189                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
2190
2191         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2192         if (r < 0)
2193                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
2194
2195         r = sd_rtnl_message_close_container(m);
2196         if (r < 0)
2197                 return log_error_errno(r, "Failed to close netlink container: %m");
2198
2199         r = sd_rtnl_message_close_container(m);
2200         if (r < 0)
2201                 return log_error_errno(r, "Failed to close netlink container: %m");
2202
2203         r = sd_rtnl_message_close_container(m);
2204         if (r < 0)
2205                 return log_error_errno(r, "Failed to close netlink container: %m");
2206
2207         r = sd_rtnl_call(rtnl, m, 0, NULL);
2208         if (r < 0)
2209                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
2210
2211         i = (int) if_nametoindex(iface_name);
2212         if (i <= 0)
2213                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
2214
2215         *ifi = i;
2216
2217         return 0;
2218 }
2219
2220 static int setup_bridge(const char veth_name[], int *ifi) {
2221         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2222         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2223         int r, bridge;
2224
2225         if (!arg_private_network)
2226                 return 0;
2227
2228         if (!arg_network_veth)
2229                 return 0;
2230
2231         if (!arg_network_bridge)
2232                 return 0;
2233
2234         bridge = (int) if_nametoindex(arg_network_bridge);
2235         if (bridge <= 0)
2236                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
2237
2238         *ifi = bridge;
2239
2240         r = sd_rtnl_open(&rtnl, 0);
2241         if (r < 0)
2242                 return log_error_errno(r, "Failed to connect to netlink: %m");
2243
2244         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
2245         if (r < 0)
2246                 return log_error_errno(r, "Failed to allocate netlink message: %m");
2247
2248         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
2249         if (r < 0)
2250                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
2251
2252         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
2253         if (r < 0)
2254                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
2255
2256         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
2257         if (r < 0)
2258                 return log_error_errno(r, "Failed to add netlink master field: %m");
2259
2260         r = sd_rtnl_call(rtnl, m, 0, NULL);
2261         if (r < 0)
2262                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
2263
2264         return 0;
2265 }
2266
2267 static int parse_interface(struct udev *udev, const char *name) {
2268         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2269         char ifi_str[2 + DECIMAL_STR_MAX(int)];
2270         int ifi;
2271
2272         ifi = (int) if_nametoindex(name);
2273         if (ifi <= 0)
2274                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
2275
2276         sprintf(ifi_str, "n%i", ifi);
2277         d = udev_device_new_from_device_id(udev, ifi_str);
2278         if (!d)
2279                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
2280
2281         if (udev_device_get_is_initialized(d) <= 0) {
2282                 log_error("Network interface %s is not initialized yet.", name);
2283                 return -EBUSY;
2284         }
2285
2286         return ifi;
2287 }
2288
2289 static int move_network_interfaces(pid_t pid) {
2290         _cleanup_udev_unref_ struct udev *udev = NULL;
2291         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2292         char **i;
2293         int r;
2294
2295         if (!arg_private_network)
2296                 return 0;
2297
2298         if (strv_isempty(arg_network_interfaces))
2299                 return 0;
2300
2301         r = sd_rtnl_open(&rtnl, 0);
2302         if (r < 0)
2303                 return log_error_errno(r, "Failed to connect to netlink: %m");
2304
2305         udev = udev_new();
2306         if (!udev) {
2307                 log_error("Failed to connect to udev.");
2308                 return -ENOMEM;
2309         }
2310
2311         STRV_FOREACH(i, arg_network_interfaces) {
2312                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2313                 int ifi;
2314
2315                 ifi = parse_interface(udev, *i);
2316                 if (ifi < 0)
2317                         return ifi;
2318
2319                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2320                 if (r < 0)
2321                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2322
2323                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2324                 if (r < 0)
2325                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2326
2327                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2328                 if (r < 0)
2329                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2330         }
2331
2332         return 0;
2333 }
2334
2335 static int setup_macvlan(pid_t pid) {
2336         _cleanup_udev_unref_ struct udev *udev = NULL;
2337         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2338         unsigned idx = 0;
2339         char **i;
2340         int r;
2341
2342         if (!arg_private_network)
2343                 return 0;
2344
2345         if (strv_isempty(arg_network_macvlan))
2346                 return 0;
2347
2348         r = sd_rtnl_open(&rtnl, 0);
2349         if (r < 0)
2350                 return log_error_errno(r, "Failed to connect to netlink: %m");
2351
2352         udev = udev_new();
2353         if (!udev) {
2354                 log_error("Failed to connect to udev.");
2355                 return -ENOMEM;
2356         }
2357
2358         STRV_FOREACH(i, arg_network_macvlan) {
2359                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2360                 _cleanup_free_ char *n = NULL;
2361                 struct ether_addr mac;
2362                 int ifi;
2363
2364                 ifi = parse_interface(udev, *i);
2365                 if (ifi < 0)
2366                         return ifi;
2367
2368                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2369                 if (r < 0)
2370                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2371
2372                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2373                 if (r < 0)
2374                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2375
2376                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2377                 if (r < 0)
2378                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2379
2380                 n = strappend("mv-", *i);
2381                 if (!n)
2382                         return log_oom();
2383
2384                 strshorten(n, IFNAMSIZ-1);
2385
2386                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2387                 if (r < 0)
2388                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2389
2390                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2391                 if (r < 0)
2392                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2393
2394                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2395                 if (r < 0)
2396                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2397
2398                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2399                 if (r < 0)
2400                         return log_error_errno(r, "Failed to open netlink container: %m");
2401
2402                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2403                 if (r < 0)
2404                         return log_error_errno(r, "Failed to open netlink container: %m");
2405
2406                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2407                 if (r < 0)
2408                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2409
2410                 r = sd_rtnl_message_close_container(m);
2411                 if (r < 0)
2412                         return log_error_errno(r, "Failed to close netlink container: %m");
2413
2414                 r = sd_rtnl_message_close_container(m);
2415                 if (r < 0)
2416                         return log_error_errno(r, "Failed to close netlink container: %m");
2417
2418                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2419                 if (r < 0)
2420                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2421         }
2422
2423         return 0;
2424 }
2425
2426 static int setup_ipvlan(pid_t pid) {
2427         _cleanup_udev_unref_ struct udev *udev = NULL;
2428         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2429         char **i;
2430         int r;
2431
2432         if (!arg_private_network)
2433                 return 0;
2434
2435         if (strv_isempty(arg_network_ipvlan))
2436                 return 0;
2437
2438         r = sd_rtnl_open(&rtnl, 0);
2439         if (r < 0)
2440                 return log_error_errno(r, "Failed to connect to netlink: %m");
2441
2442         udev = udev_new();
2443         if (!udev) {
2444                 log_error("Failed to connect to udev.");
2445                 return -ENOMEM;
2446         }
2447
2448         STRV_FOREACH(i, arg_network_ipvlan) {
2449                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2450                 _cleanup_free_ char *n = NULL;
2451                 int ifi;
2452
2453                 ifi = parse_interface(udev, *i);
2454                 if (ifi < 0)
2455                         return ifi;
2456
2457                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2458                 if (r < 0)
2459                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2460
2461                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2462                 if (r < 0)
2463                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2464
2465                 n = strappend("iv-", *i);
2466                 if (!n)
2467                         return log_oom();
2468
2469                 strshorten(n, IFNAMSIZ-1);
2470
2471                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2472                 if (r < 0)
2473                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2474
2475                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2476                 if (r < 0)
2477                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2478
2479                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2480                 if (r < 0)
2481                         return log_error_errno(r, "Failed to open netlink container: %m");
2482
2483                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "ipvlan");
2484                 if (r < 0)
2485                         return log_error_errno(r, "Failed to open netlink container: %m");
2486
2487                 r = sd_rtnl_message_append_u16(m, IFLA_IPVLAN_MODE, IPVLAN_MODE_L2);
2488                 if (r < 0)
2489                         return log_error_errno(r, "Failed to add ipvlan mode: %m");
2490
2491                 r = sd_rtnl_message_close_container(m);
2492                 if (r < 0)
2493                         return log_error_errno(r, "Failed to close netlink container: %m");
2494
2495                 r = sd_rtnl_message_close_container(m);
2496                 if (r < 0)
2497                         return log_error_errno(r, "Failed to close netlink container: %m");
2498
2499                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2500                 if (r < 0)
2501                         return log_error_errno(r, "Failed to add new ipvlan interfaces: %m");
2502         }
2503
2504         return 0;
2505 }
2506
/* Builds and loads a default-allow seccomp filter. A fixed set of system
 * calls is made to fail with EPERM; the kernel-module calls are added to
 * that set unless CAP_SYS_MODULE is in arg_retain; and creating
 * NETLINK_AUDIT sockets is made to fail with EAFNOSUPPORT. When built
 * without HAVE_SECCOMP this does nothing and returns 0. Returns the result
 * of the last libseccomp call otherwise (negative on failure). */
static int setup_seccomp(void) {

#ifndef HAVE_SECCOMP
        return 0;
#else
        /* Always refused */
        static const int always_blocked[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        /* Refused only when CAP_SYS_MODULE is not retained */
        static const int module_blocked[] = {
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(always_blocked); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), always_blocked[k], 0);

                /* -EFAULT is tolerated: it signals an unknown syscall */
                if (r >= 0 || r == -EFAULT)
                        continue;

                log_error_errno(r, "Failed to block syscall: %m");
                goto finish;
        }

        /* Without a request for CAP_SYS_MODULE the kmod syscalls are
         * blocked as well. */
        if ((arg_retain & (1ULL << CAP_SYS_MODULE)) == 0) {
                for (k = 0; k < ELEMENTSOF(module_blocked); k++) {
                        r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), module_blocked[k], 0);

                        /* -EFAULT is tolerated: it signals an unknown syscall */
                        if (r >= 0 || r == -EFAULT)
                                continue;

                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /*
           Audit does not work in containers: much of the userspace
           audit hookup fails when run inside one. We do not care and
           simply prevent audit sockets from being created.

           socket(AF_NETLINK, *, NETLINK_AUDIT) then fails with
           EAFNOSUPPORT, which audit userspace takes as the sign that
           audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Clear the filter's NO_NEW_PRIVS control attribute before loading */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path past seccomp_init() */
        seccomp_release(ctx);
        return r;
#endif

}
2603
2604 static int setup_propagate(const char *root) {
2605         const char *p, *q;
2606
2607         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2608         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2609         p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2610         (void) mkdir_p(p, 0600);
2611
2612         q = strjoina(root, "/run/systemd/nspawn/incoming");
2613         mkdir_parents(q, 0755);
2614         mkdir_p(q, 0600);
2615
2616         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2617                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2618
2619         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2620                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2621
2622         return 0;
2623 }
2624
2625 static int setup_image(char **device_path, int *loop_nr) {
2626         struct loop_info64 info = {
2627                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2628         };
2629         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2630         _cleanup_free_ char* loopdev = NULL;
2631         struct stat st;
2632         int r, nr;
2633
2634         assert(device_path);
2635         assert(loop_nr);
2636         assert(arg_image);
2637
2638         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2639         if (fd < 0)
2640                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2641
2642         if (fstat(fd, &st) < 0)
2643                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2644
2645         if (S_ISBLK(st.st_mode)) {
2646                 char *p;
2647
2648                 p = strdup(arg_image);
2649                 if (!p)
2650                         return log_oom();
2651
2652                 *device_path = p;
2653
2654                 *loop_nr = -1;
2655
2656                 r = fd;
2657                 fd = -1;
2658
2659                 return r;
2660         }
2661
2662         if (!S_ISREG(st.st_mode)) {
2663                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2664                 return -EINVAL;
2665         }
2666
2667         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2668         if (control < 0)
2669                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2670
2671         nr = ioctl(control, LOOP_CTL_GET_FREE);
2672         if (nr < 0)
2673                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2674
2675         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2676                 return log_oom();
2677
2678         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2679         if (loop < 0)
2680                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2681
2682         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2683                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2684
2685         if (arg_read_only)
2686                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2687
2688         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2689                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2690
2691         *device_path = loopdev;
2692         loopdev = NULL;
2693
2694         *loop_nr = nr;
2695
2696         r = loop;
2697         loop = -1;
2698
2699         return r;
2700 }
2701
/* Explanatory text appended to the partition-table related error messages
 * of dissect_image(), describing which disk image layouts are accepted. */
#define PARTITION_TABLE_BLURB \
        "Note that the disk image needs to either contain only a single MBR partition of\n" \
        "type 0x83 that is marked bootable, or a single GPT partition of type " \
        "0FC63DAF-8483-4772-8E79-3D69D8477DE4 or follow\n" \
        "    http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/\n" \
        "to be bootable with systemd-nspawn."
2708
/* Inspects the partition table of the block device open on 'fd' and
 * selects the partitions to use for the container.
 *
 * Selection rules, as implemented below:
 *   - GPT: partitions flagged GPT_FLAG_NO_AUTO are ignored. For the home,
 *     srv, native-root and secondary-root type IDs the partition with the
 *     lowest partition number wins. Generic Linux partitions are only
 *     used as root if no dedicated root was found, and only if there is
 *     exactly one of them.
 *   - MBR: only partitions whose flags equal 0x80 (bootable) and whose
 *     type is 0x83 are considered; exactly one such partition must exist
 *     and it becomes the root.
 *
 * On success returns 0 and stores newly allocated device node paths:
 * *root_device / *root_device_rw are always set, *secondary tells whether
 * the secondary-architecture root was chosen, and the home/srv outputs
 * are written only when such a partition was found (otherwise they are
 * left untouched). Returns a negative value on failure; without
 * HAVE_BLKID this always fails with -ENOTSUP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL, *generic = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true, generic_rw = true;
        bool is_gpt, is_mbr, multiple_generic = false;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        unsigned i;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* blkid does not reliably set errno; clear it first so that an
         * unset errno can be told apart (and treated as OOM). */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);

        is_gpt = streq_ptr(pttype, "gpt");
        is_mbr = streq_ptr(pttype, "dos");

        if (!is_gpt && !is_mbr) {
                log_error("No GPT or MBR partition table discovered on\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Wait until the kernel's view of the partitions matches what
         * blkid found, re-enumerating at most 10 times. On leaving the
         * loop via 'break', 'e' holds the matching enumeration. */
        for (i = 0;; i++) {
                int n, m;

                if (i >= 10) {
                        log_error("Kernel partitions never appeared.");
                        return -ENXIO;
                }

                e = udev_enumerate_new(udev);
                if (!e)
                        return log_oom();

                r = udev_enumerate_add_match_parent(e, d);
                if (r < 0)
                        return log_oom();

                r = udev_enumerate_scan_devices(e);
                if (r < 0)
                        return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

                /* Count the partitions enumerated by the kernel */
                n = 0;
                first = udev_enumerate_get_list_entry(e);
                udev_list_entry_foreach(item, first)
                        n++;

                /* Count the partitions enumerated by blkid. The udev
                 * list is expected to hold one entry more than blkid
                 * reports (presumably the whole-disk device itself,
                 * which the loop further down skips explicitly). */
                m = blkid_partlist_numof_partitions(pl);
                if (n == m + 1)
                        break;
                if (n > m + 1) {
                        log_error("blkid and kernel partition list do not match.");
                        return -EIO;
                }
                if (n < m + 1) {
                        unsigned j;

                        /* The kernel has probed fewer partitions than
                         * blkid? Maybe the kernel prober is still
                         * running or it got EBUSY because udev
                         * already opened the device. Let's reprobe
                         * the device, which is a synchronous call
                         * that waits until probing is complete. */

                        for (j = 0; j < 20; j++) {

                                r = ioctl(fd, BLKRRPART, 0);
                                if (r < 0)
                                        r = -errno;
                                if (r >= 0 || r != -EBUSY)
                                        break;

                                /* If something else has the device
                                 * open, such as an udev rule, the
                                 * ioctl will return EBUSY. Since
                                 * there's no way to wait until it
                                 * isn't busy anymore, let's just wait
                                 * a bit, and try again.
                                 *
                                 * This is really something they
                                 * should fix in the kernel! */

                                usleep(50 * USEC_PER_MSEC);
                        }

                        if (r < 0)
                                return log_error_errno(r, "Failed to reread partition table: %m");
                }

                e = udev_enumerate_unref(e);
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q;
                const char *node;
                unsigned long long flags;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number, the whole-disk
                 * device itself, entries without a device node and
                 * anything blkid does not know as a partition. */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                if (is_gpt) {
                        sd_id128_t type_id;
                        const char *stype;

                        if (flags & GPT_FLAG_NO_AUTO)
                                continue;

                        stype = blkid_partition_get_type_string(pp);
                        if (!stype)
                                continue;

                        if (sd_id128_from_string(stype, &type_id) < 0)
                                continue;

                        /* For each dedicated type, keep the candidate
                         * with the lowest partition number. */
                        if (sd_id128_equal(type_id, GPT_HOME)) {

                                if (home && nr >= home_nr)
                                        continue;

                                home_nr = nr;
                                home_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&home, node);
                                if (r < 0)
                                        return log_oom();

                        } else if (sd_id128_equal(type_id, GPT_SRV)) {

                                if (srv && nr >= srv_nr)
                                        continue;

                                srv_nr = nr;
                                srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&srv, node);
                                if (r < 0)
                                        return log_oom();
                        }
#ifdef GPT_ROOT_NATIVE
                        else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                                if (root && nr >= root_nr)
                                        continue;

                                root_nr = nr;
                                root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
#ifdef GPT_ROOT_SECONDARY
                        else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                                if (secondary_root && nr >= secondary_root_nr)
                                        continue;

                                secondary_root_nr = nr;
                                secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                                r = free_and_strdup(&secondary_root, node);
                                if (r < 0)
                                        return log_oom();
                        }
#endif
                        else if (sd_id128_equal(type_id, GPT_LINUX_GENERIC)) {

                                /* Remember only the first generic
                                 * partition, but note if there are more. */
                                if (generic)
                                        multiple_generic = true;
                                else {
                                        generic_rw = !(flags & GPT_FLAG_READ_ONLY);

                                        r = free_and_strdup(&generic, node);
                                        if (r < 0)
                                                return log_oom();
                                }
                        }

                } else if (is_mbr) {
                        int type;

                        if (flags != 0x80) /* Bootable flag */
                                continue;

                        type = blkid_partition_get_type(pp);
                        if (type != 0x83) /* Linux partition */
                                continue;

                        if (generic)
                                multiple_generic = true;
                        else {
                                generic_rw = true;

                                /* Record the candidate in 'generic' (not
                                 * 'root'): that is the variable tested
                                 * above, so a second bootable Linux
                                 * partition is actually detected and
                                 * rejected by the fallback below. */
                                r = free_and_strdup(&generic, node);
                                if (r < 0)
                                        return log_oom();
                        }
                }
        }

        /* Pick the root: native root first, then the secondary
         * architecture's root, then a single generic partition. Ownership
         * of the chosen string moves to the caller, hence the local
         * pointer is reset so the cleanup handler does not free it. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        } else if (generic) {

                /* There were no partitions with precise meanings
                 * around, but we found generic partitions. In this
                 * case, if there's only one, we can go ahead and boot
                 * it, otherwise we bail out, because we really cannot
                 * make any sense of it. */

                if (multiple_generic) {
                        log_error("Identified multiple bootable Linux partitions on\n"
                                  "    %s\n"
                                  PARTITION_TABLE_BLURB, arg_image);
                        return -EINVAL;
                }

                *root_device = generic;
                generic = NULL;

                *root_device_rw = generic_rw;
                *secondary = false;
        } else {
                log_error("Failed to identify root partition in disk image\n"
                          "    %s\n"
                          PARTITION_TABLE_BLURB, arg_image);
                return -EINVAL;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3081
/* Mounts the block device 'what' on 'where', or on 'where' with
 * 'directory' appended when 'directory' is non-NULL. The file system type
 * is detected with blkid. The mount is read-only when 'rw' is false or
 * arg_read_only is set, and always uses MS_NODEV. LUKS volumes are
 * rejected with -ENOTSUP.
 *
 * Returns 0 on success and a negative value on failure; without
 * HAVE_BLKID this always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strjoina(where, directory);
        else
                p = where;

        /* blkid does not reliably set errno; clear it first so that an
         * unset errno can be told apart. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. Same check
                 * as in dissect_image(); a plain -1 is a real probing
                 * error and is reported with its errno below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
3145
3146 static int mount_devices(
3147                 const char *where,
3148                 const char *root_device, bool root_device_rw,
3149                 const char *home_device, bool home_device_rw,
3150                 const char *srv_device, bool srv_device_rw) {
3151         int r;
3152
3153         assert(where);
3154
3155         if (root_device) {
3156                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
3157                 if (r < 0)
3158                         return log_error_errno(r, "Failed to mount root directory: %m");
3159         }
3160
3161         if (home_device) {
3162                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
3163                 if (r < 0)
3164                         return log_error_errno(r, "Failed to mount home directory: %m");
3165         }
3166
3167         if (srv_device) {
3168                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
3169                 if (r < 0)
3170                         return log_error_errno(r, "Failed to mount server data directory: %m");
3171         }
3172
3173         return 0;
3174 }
3175
3176 static void loop_remove(int nr, int *image_fd) {
3177         _cleanup_close_ int control = -1;
3178         int r;
3179
3180         if (nr < 0)
3181                 return;
3182
3183         if (image_fd && *image_fd >= 0) {
3184                 r = ioctl(*image_fd, LOOP_CLR_FD);
3185                 if (r < 0)
3186                         log_debug_errno(errno, "Failed to close loop image: %m");
3187                 *image_fd = safe_close(*image_fd);
3188         }
3189
3190         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
3191         if (control < 0) {
3192                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
3193                 return;
3194         }
3195
3196         r = ioctl(control, LOOP_CTL_REMOVE, nr);
3197         if (r < 0)
3198                 log_debug_errno(errno, "Failed to remove loop %d: %m", nr);
3199 }
3200
3201 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
3202         int pipe_fds[2];
3203         pid_t pid;
3204
3205         assert(database);
3206         assert(key);
3207         assert(rpid);
3208
3209         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
3210                 return log_error_errno(errno, "Failed to allocate pipe: %m");
3211
3212         pid = fork();
3213         if (pid < 0)
3214                 return log_error_errno(errno, "Failed to fork getent child: %m");
3215         else if (pid == 0) {
3216                 int nullfd;
3217                 char *empty_env = NULL;
3218
3219                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
3220                         _exit(EXIT_FAILURE);
3221
3222                 if (pipe_fds[0] > 2)
3223                         safe_close(pipe_fds[0]);
3224                 if (pipe_fds[1] > 2)
3225                         safe_close(pipe_fds[1]);
3226
3227                 nullfd = open("/dev/null", O_RDWR);
3228                 if (nullfd < 0)
3229                         _exit(EXIT_FAILURE);
3230
3231                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
3232                         _exit(EXIT_FAILURE);
3233
3234                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
3235                         _exit(EXIT_FAILURE);
3236
3237                 if (nullfd > 2)
3238                         safe_close(nullfd);
3239
3240                 reset_all_signal_handlers();
3241                 close_all_fds(NULL, 0);
3242
3243                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
3244                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
3245                 _exit(EXIT_FAILURE);
3246         }
3247
3248         pipe_fds[1] = safe_close(pipe_fds[1]);
3249
3250         *rpid = pid;
3251
3252         return pipe_fds[0];
3253 }
3254
3255 static int change_uid_gid(char **_home) {
3256         char line[LINE_MAX], *x, *u, *g, *h;
3257         const char *word, *state;
3258         _cleanup_free_ uid_t *uids = NULL;
3259         _cleanup_free_ char *home = NULL;
3260         _cleanup_fclose_ FILE *f = NULL;
3261         _cleanup_close_ int fd = -1;
3262         unsigned n_uids = 0;
3263         size_t sz = 0, l;
3264         uid_t uid;
3265         gid_t gid;
3266         pid_t pid;
3267         int r;
3268
3269         assert(_home);
3270
3271         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
3272                 /* Reset everything fully to 0, just in case */
3273
3274                 if (setgroups(0, NULL) < 0)
3275                         return log_error_errno(errno, "setgroups() failed: %m");
3276
3277                 if (setresgid(0, 0, 0) < 0)
3278                         return log_error_errno(errno, "setregid() failed: %m");
3279
3280                 if (setresuid(0, 0, 0) < 0)
3281                         return log_error_errno(errno, "setreuid() failed: %m");
3282
3283                 *_home = NULL;
3284                 return 0;
3285         }
3286
3287         /* First, get user credentials */
3288         fd = spawn_getent("passwd", arg_user, &pid);
3289         if (fd < 0)
3290                 return fd;
3291
3292         f = fdopen(fd, "r");
3293         if (!f)
3294                 return log_oom();
3295         fd = -1;
3296
3297         if (!fgets(line, sizeof(line), f)) {
3298
3299                 if (!ferror(f)) {
3300                         log_error("Failed to resolve user %s.", arg_user);
3301                         return -ESRCH;
3302                 }
3303
3304                 log_error_errno(errno, "Failed to read from getent: %m");
3305                 return -errno;
3306         }
3307
3308         truncate_nl(line);
3309
3310         wait_for_terminate_and_warn("getent passwd", pid, true);
3311
3312         x = strchr(line, ':');
3313         if (!x) {
3314                 log_error("/etc/passwd entry has invalid user field.");
3315                 return -EIO;
3316         }
3317
3318         u = strchr(x+1, ':');
3319         if (!u) {
3320                 log_error("/etc/passwd entry has invalid password field.");
3321                 return -EIO;
3322         }
3323
3324         u++;
3325         g = strchr(u, ':');
3326         if (!g) {
3327                 log_error("/etc/passwd entry has invalid UID field.");
3328                 return -EIO;
3329         }
3330
3331         *g = 0;
3332         g++;
3333         x = strchr(g, ':');
3334         if (!x) {
3335                 log_error("/etc/passwd entry has invalid GID field.");
3336                 return -EIO;
3337         }
3338
3339         *x = 0;
3340         h = strchr(x+1, ':');
3341         if (!h) {
3342                 log_error("/etc/passwd entry has invalid GECOS field.");
3343                 return -EIO;
3344         }
3345
3346         h++;
3347         x = strchr(h, ':');
3348         if (!x) {
3349                 log_error("/etc/passwd entry has invalid home directory field.");
3350                 return -EIO;
3351         }
3352
3353         *x = 0;
3354
3355         r = parse_uid(u, &uid);
3356         if (r < 0) {
3357                 log_error("Failed to parse UID of user.");
3358                 return -EIO;
3359         }
3360
3361         r = parse_gid(g, &gid);
3362         if (r < 0) {
3363                 log_error("Failed to parse GID of user.");
3364                 return -EIO;
3365         }
3366
3367         home = strdup(h);
3368         if (!home)
3369                 return log_oom();
3370
3371         /* Second, get group memberships */
3372         fd = spawn_getent("initgroups", arg_user, &pid);
3373         if (fd < 0)
3374                 return fd;
3375
3376         fclose(f);
3377         f = fdopen(fd, "r");
3378         if (!f)
3379                 return log_oom();
3380         fd = -1;
3381
3382         if (!fgets(line, sizeof(line), f)) {
3383                 if (!ferror(f)) {
3384                         log_error("Failed to resolve user %s.", arg_user);
3385                         return -ESRCH;
3386                 }
3387
3388                 log_error_errno(errno, "Failed to read from getent: %m");
3389                 return -errno;
3390         }
3391
3392         truncate_nl(line);
3393
3394         wait_for_terminate_and_warn("getent initgroups", pid, true);
3395
3396         /* Skip over the username and subsequent separator whitespace */
3397         x = line;
3398         x += strcspn(x, WHITESPACE);
3399         x += strspn(x, WHITESPACE);
3400
3401         FOREACH_WORD(word, l, x, state) {
3402                 char c[l+1];
3403
3404                 memcpy(c, word, l);
3405                 c[l] = 0;
3406
3407                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
3408                         return log_oom();
3409
3410                 r = parse_uid(c, &uids[n_uids++]);
3411                 if (r < 0) {
3412                         log_error("Failed to parse group data from getent.");
3413                         return -EIO;
3414                 }
3415         }
3416
3417         r = mkdir_parents(home, 0775);
3418         if (r < 0)
3419                 return log_error_errno(r, "Failed to make home root directory: %m");
3420
3421         r = mkdir_safe(home, 0755, uid, gid);
3422         if (r < 0 && r != -EEXIST)
3423                 return log_error_errno(r, "Failed to make home directory: %m");
3424
3425         fchown(STDIN_FILENO, uid, gid);
3426         fchown(STDOUT_FILENO, uid, gid);
3427         fchown(STDERR_FILENO, uid, gid);
3428
3429         if (setgroups(n_uids, uids) < 0)
3430                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
3431
3432         if (setresgid(gid, gid, gid) < 0)
3433                 return log_error_errno(errno, "setregid() failed: %m");
3434
3435         if (setresuid(uid, uid, uid) < 0)
3436                 return log_error_errno(errno, "setreuid() failed: %m");
3437
3438         if (_home) {
3439                 *_home = home;
3440                 home = NULL;
3441         }
3442
3443         return 0;
3444 }
3445
3446 /*
3447  * Return values:
3448  * < 0 : wait_for_terminate() failed to get the state of the
3449  *       container, the container was terminated by a signal, or
3450  *       failed for an unknown reason.  No change is made to the
3451  *       container argument.
3452  * > 0 : The program executed in the container terminated with an
3453  *       error.  The exit code of the program executed in the
3454  *       container is returned.  The container argument has been set
3455  *       to CONTAINER_TERMINATED.
3456  *   0 : The container is being rebooted, has been shut down or exited
3457  *       successfully.  The container argument has been set to either
3458  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
3459  *
3460  * That is, success is indicated by a return value of zero, and an
3461  * error is indicated by a non-zero value.
3462  */
3463 static int wait_for_container(pid_t pid, ContainerStatus *container) {
3464         siginfo_t status;
3465         int r;
3466
3467         r = wait_for_terminate(pid, &status);
3468         if (r < 0)
3469                 return log_warning_errno(r, "Failed to wait for container: %m");
3470
3471         switch (status.si_code) {
3472
3473         case CLD_EXITED:
3474                 if (status.si_status == 0) {
3475                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
3476
3477                 } else
3478                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
3479
3480                 *container = CONTAINER_TERMINATED;
3481                 return status.si_status;
3482
3483         case CLD_KILLED:
3484                 if (status.si_status == SIGINT) {
3485
3486                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
3487                         *container = CONTAINER_TERMINATED;
3488                         return 0;
3489
3490                 } else if (status.si_status == SIGHUP) {
3491
3492                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
3493                         *container = CONTAINER_REBOOTED;
3494                         return 0;
3495                 }
3496
3497                 /* CLD_KILLED fallthrough */
3498
3499         case CLD_DUMPED:
3500                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3501                 return -EIO;
3502
3503         default:
3504                 log_error("Container %s failed due to unknown reason.", arg_machine);
3505                 return -EIO;
3506         }
3507
3508         return r;
3509 }
3510
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
}
3512
3513 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
3514         pid_t pid;
3515
3516         pid = PTR_TO_UINT32(userdata);
3517         if (pid > 0) {
3518                 if (kill(pid, SIGRTMIN+3) >= 0) {
3519                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
3520                         sd_event_source_set_userdata(s, NULL);
3521                         return 0;
3522                 }
3523         }
3524
3525         sd_event_exit(sd_event_source_get_event(s), 0);
3526         return 0;
3527 }
3528
3529 static int determine_names(void) {
3530         int r;
3531
3532         if (!arg_image && !arg_directory) {
3533                 if (arg_machine) {
3534                         _cleanup_(image_unrefp) Image *i = NULL;
3535
3536                         r = image_find(arg_machine, &i);
3537                         if (r < 0)
3538                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3539                         else if (r == 0) {
3540                                 log_error("No image for machine '%s': %m", arg_machine);
3541                                 return -ENOENT;
3542                         }
3543
3544                         if (i->type == IMAGE_RAW)
3545                                 r = set_sanitized_path(&arg_image, i->path);
3546                         else
3547                                 r = set_sanitized_path(&arg_directory, i->path);
3548                         if (r < 0)
3549                                 return log_error_errno(r, "Invalid image directory: %m");
3550
3551                         arg_read_only = arg_read_only || i->read_only;
3552                 } else
3553                         arg_directory = get_current_dir_name();
3554
3555                 if (!arg_directory && !arg_machine) {
3556                         log_error("Failed to determine path, please use -D or -i.");
3557                         return -EINVAL;
3558                 }
3559         }
3560
3561         if (!arg_machine) {
3562                 if (arg_directory && path_equal(arg_directory, "/"))
3563                         arg_machine = gethostname_malloc();
3564                 else
3565                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3566
3567                 if (!arg_machine)
3568                         return log_oom();
3569
3570                 hostname_cleanup(arg_machine, false);
3571                 if (!machine_name_is_valid(arg_machine)) {
3572                         log_error("Failed to determine machine name automatically, please use -M.");
3573                         return -EINVAL;
3574                 }
3575
3576                 if (arg_ephemeral) {
3577                         char *b;
3578
3579                         /* Add a random suffix when this is an
3580                          * ephemeral machine, so that we can run many
3581                          * instances at once without manually having
3582                          * to specify -M each time. */
3583
3584                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3585                                 return log_oom();
3586
3587                         free(arg_machine);
3588                         arg_machine = b;
3589                 }
3590         }
3591
3592         return 0;
3593 }
3594
3595 int main(int argc, char *argv[]) {
3596
3597         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3598         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3599         _cleanup_close_ int master = -1, image_fd = -1;
3600         _cleanup_fdset_free_ FDSet *fds = NULL;
3601         int r, n_fd_passed, loop_nr = -1;
3602         char veth_name[IFNAMSIZ];
3603         bool secondary = false, remove_subvol = false;
3604         sigset_t mask, mask_chld;
3605         pid_t pid = 0;
3606         int ret = EXIT_SUCCESS;
3607         union in_addr_union exposed = {};
3608         _cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
3609
3610         log_parse_environment();
3611         log_open();
3612
3613         r = parse_argv(argc, argv);
3614         if (r <= 0)
3615                 goto finish;
3616
3617         r = determine_names();
3618         if (r < 0)
3619                 goto finish;
3620
3621         if (geteuid() != 0) {
3622                 log_error("Need to be root.");
3623                 r = -EPERM;
3624                 goto finish;
3625         }
3626
3627         if (sd_booted() <= 0) {
3628                 log_error("Not running on a systemd system.");
3629                 r = -EINVAL;
3630                 goto finish;
3631         }
3632
3633         log_close();
3634         n_fd_passed = sd_listen_fds(false);
3635         if (n_fd_passed > 0) {
3636                 r = fdset_new_listen_fds(&fds, false);
3637                 if (r < 0) {
3638                         log_error_errno(r, "Failed to collect file descriptors: %m");
3639                         goto finish;
3640                 }
3641         }
3642         fdset_close_others(fds);
3643         log_open();
3644
3645         if (arg_directory) {
3646                 assert(!arg_image);
3647
3648                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3649                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3650                         r = -EINVAL;
3651                         goto finish;
3652                 }
3653
3654                 if (arg_ephemeral) {
3655                         char *np;
3656
3657                         /* If the specified path is a mount point we
3658                          * generate the new snapshot immediately
3659                          * inside it under a random name. However if
3660                          * the specified is not a mount point we
3661                          * create the new snapshot in the parent
3662                          * directory, just next to it. */
3663                         r = path_is_mount_point(arg_directory, false);
3664                         if (r < 0) {
3665                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3666                                 goto finish;
3667                         }
3668                         if (r > 0)
3669                                 r = tempfn_random_child(arg_directory, &np);
3670                         else
3671                                 r = tempfn_random(arg_directory, &np);
3672                         if (r < 0) {
3673                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3674                                 goto finish;
3675                         }
3676
3677                         r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3678                         if (r < 0) {
3679                                 log_error_errno(r, "Failed to lock %s: %m", np);
3680                                 goto finish;
3681                         }
3682
3683                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3684                         if (r < 0) {
3685                                 free(np);
3686                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3687                                 goto finish;
3688                         }
3689
3690                         free(arg_directory);
3691                         arg_directory = np;
3692
3693                         remove_subvol = true;
3694
3695                 } else {
3696                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3697                         if (r == -EBUSY) {
3698                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
3699                                 goto finish;
3700                         }
3701                         if (r < 0) {
3702                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
3703                                 return r;
3704                         }
3705
3706                         if (arg_template) {
3707                                 r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3708                                 if (r == -EEXIST) {
3709                                         if (!arg_quiet)
3710                                                 log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3711                                 } else if (r < 0) {
3712                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
3713                                         goto finish;
3714                                 } else {
3715                                         if (!arg_quiet)
3716                                                 log_info("Populated %s from template %s.", arg_directory, arg_template);
3717                                 }
3718                         }
3719                 }
3720
3721                 if (arg_boot) {
3722                         if (path_is_os_tree(arg_directory) <= 0) {
3723                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3724                                 r = -EINVAL;
3725                                 goto finish;
3726                         }
3727                 } else {
3728                         const char *p;
3729
3730                         p = strjoina(arg_directory,
3731                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3732                         if (access(p, F_OK) < 0) {
3733                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3734                                 r = -EINVAL;
3735                                 goto finish;
3736                         }
3737                 }
3738
3739         } else {
3740                 char template[] = "/tmp/nspawn-root-XXXXXX";
3741
3742                 assert(arg_image);
3743                 assert(!arg_template);
3744
3745                 r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
3746                 if (r == -EBUSY) {
3747                         r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
3748                         goto finish;
3749                 }
3750                 if (r < 0) {
3751                         r = log_error_errno(r, "Failed to create image lock: %m");
3752                         goto finish;
3753                 }
3754
3755                 if (!mkdtemp(template)) {
3756                         log_error_errno(errno, "Failed to create temporary directory: %m");
3757                         r = -errno;
3758                         goto finish;
3759                 }
3760
3761                 arg_directory = strdup(template);
3762                 if (!arg_directory) {
3763                         r = log_oom();
3764                         goto finish;
3765                 }
3766
3767                 image_fd = setup_image(&device_path, &loop_nr);
3768                 if (image_fd < 0) {
3769                         r = image_fd;
3770                         goto finish;
3771                 }
3772
3773                 r = dissect_image(image_fd,
3774                                   &root_device, &root_device_rw,
3775                                   &home_device, &home_device_rw,
3776                                   &srv_device, &srv_device_rw,
3777                                   &secondary);
3778                 if (r < 0)
3779                         goto finish;
3780         }
3781
3782         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3783         if (master < 0) {
3784                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3785                 goto finish;
3786         }
3787
3788         r = ptsname_malloc(master, &console);
3789         if (r < 0) {
3790                 r = log_error_errno(r, "Failed to determine tty name: %m");
3791                 goto finish;
3792         }
3793
3794         if (!arg_quiet)
3795                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3796                          arg_machine, arg_image ?: arg_directory);
3797
3798         if (unlockpt(master) < 0) {
3799                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3800                 goto finish;
3801         }
3802
3803         assert_se(sigemptyset(&mask) == 0);
3804         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3805         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3806
3807         assert_se(sigemptyset(&mask_chld) == 0);
3808         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3809
3810         for (;;) {
3811                 _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 }, rtnl_socket_pair[2] = { -1, -1 };
3812                 ContainerStatus container_status;
3813                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3814                 struct sigaction sa = {
3815                         .sa_handler = nop_handler,
3816                         .sa_flags = SA_NOCLDSTOP,
3817                 };
3818
3819                 r = barrier_create(&barrier);
3820                 if (r < 0) {
3821                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3822                         goto finish;
3823                 }
3824
3825                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3826                         r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3827                         goto finish;
3828                 }
3829
3830                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0) {
3831                         r = log_error_errno(errno, "Failed to create rtnl socket pair: %m");
3832                         goto finish;
3833                 }
3834
3835                 /* Child can be killed before execv(), so handle SIGCHLD
3836                  * in order to interrupt parent's blocking calls and
3837                  * give it a chance to call wait() and terminate. */
3838                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3839                 if (r < 0) {
3840                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3841                         goto finish;
3842                 }
3843
3844                 r = sigaction(SIGCHLD, &sa, NULL);
3845                 if (r < 0) {
3846                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3847                         goto finish;
3848                 }
3849
3850                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3851                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3852                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3853                 if (pid < 0) {
3854                         if (errno == EINVAL)
3855                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3856                         else
3857                                 r = log_error_errno(errno, "clone() failed: %m");
3858
3859                         goto finish;
3860                 }
3861
3862                 if (pid == 0) {
3863                         /* child */
3864                         _cleanup_free_ char *home = NULL;
3865                         unsigned n_env = 2;
3866                         const char *envp[] = {
3867                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3868                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3869                                 NULL, /* TERM */
3870                                 NULL, /* HOME */
3871                                 NULL, /* USER */
3872                                 NULL, /* LOGNAME */
3873                                 NULL, /* container_uuid */
3874                                 NULL, /* LISTEN_FDS */
3875                                 NULL, /* LISTEN_PID */
3876                                 NULL
3877                         };
3878                         char **env_use;
3879
3880                         barrier_set_role(&barrier, BARRIER_CHILD);
3881
3882                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3883                         if (envp[n_env])
3884                                 n_env ++;
3885
3886                         master = safe_close(master);
3887
3888                         close_nointr(STDIN_FILENO);
3889                         close_nointr(STDOUT_FILENO);
3890                         close_nointr(STDERR_FILENO);
3891
3892                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3893                         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
3894
3895                         reset_all_signal_handlers();
3896                         reset_signal_mask();
3897
3898                         r = open_terminal(console, O_RDWR);
3899                         if (r != STDIN_FILENO) {
3900                                 if (r >= 0) {
3901                                         safe_close(r);
3902                                         r = -EINVAL;
3903                                 }
3904
3905                                 log_error_errno(r, "Failed to open console: %m");
3906                                 _exit(EXIT_FAILURE);
3907                         }
3908
3909                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3910                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3911                                 log_error_errno(errno, "Failed to duplicate console: %m");
3912                                 _exit(EXIT_FAILURE);
3913                         }
3914
3915                         if (setsid() < 0) {
3916                                 log_error_errno(errno, "setsid() failed: %m");
3917                                 _exit(EXIT_FAILURE);
3918                         }
3919
3920                         if (reset_audit_loginuid() < 0)
3921                                 _exit(EXIT_FAILURE);
3922
3923                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3924                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3925                                 _exit(EXIT_FAILURE);
3926                         }
3927
3928                         /* Mark everything as slave, so that we still
3929                          * receive mounts from the real root, but don't
3930                          * propagate mounts to the real root. */
3931                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3932                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3933                                 _exit(EXIT_FAILURE);
3934                         }
3935
3936                         if (mount_devices(arg_directory,
3937                                           root_device, root_device_rw,
3938                                           home_device, home_device_rw,
3939                                           srv_device, srv_device_rw) < 0)
3940                                 _exit(EXIT_FAILURE);
3941
3942                         /* Turn directory into bind mount */
3943                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3944                                 log_error_errno(errno, "Failed to make bind mount: %m");
3945                                 _exit(EXIT_FAILURE);
3946                         }
3947
3948                         r = setup_volatile(arg_directory);
3949                         if (r < 0)
3950                                 _exit(EXIT_FAILURE);
3951
3952                         if (setup_volatile_state(arg_directory) < 0)
3953                                 _exit(EXIT_FAILURE);
3954
3955                         r = base_filesystem_create(arg_directory);
3956                         if (r < 0)
3957                                 _exit(EXIT_FAILURE);
3958
3959                         if (arg_read_only) {
3960                                 r = bind_remount_recursive(arg_directory, true);
3961                                 if (r < 0) {
3962                                         log_error_errno(r, "Failed to make tree read-only: %m");
3963                                         _exit(EXIT_FAILURE);
3964                                 }
3965                         }
3966
3967                         if (mount_all(arg_directory) < 0)
3968                                 _exit(EXIT_FAILURE);
3969
3970                         if (copy_devnodes(arg_directory) < 0)
3971                                 _exit(EXIT_FAILURE);
3972
3973                         if (setup_ptmx(arg_directory) < 0)
3974                                 _exit(EXIT_FAILURE);
3975
3976                         dev_setup(arg_directory);
3977
3978                         if (setup_propagate(arg_directory) < 0)
3979                                 _exit(EXIT_FAILURE);
3980
3981                         if (setup_seccomp() < 0)
3982                                 _exit(EXIT_FAILURE);
3983
3984                         if (setup_dev_console(arg_directory, console) < 0)
3985                                 _exit(EXIT_FAILURE);
3986
3987                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3988                                 _exit(EXIT_FAILURE);
3989                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3990
3991                         if (send_rtnl(rtnl_socket_pair[1]) < 0)
3992                                 _exit(EXIT_FAILURE);
3993                         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
3994
3995                         /* Tell the parent that we are ready, and that
3996                          * it can cgroupify us so that we lack access
3997                          * to certain devices and resources. */
3998                         (void) barrier_place(&barrier);
3999
4000                         if (setup_boot_id(arg_directory) < 0)
4001                                 _exit(EXIT_FAILURE);
4002
4003                         if (setup_timezone(arg_directory) < 0)
4004                                 _exit(EXIT_FAILURE);
4005
4006                         if (setup_resolv_conf(arg_directory) < 0)
4007                                 _exit(EXIT_FAILURE);
4008
4009                         if (setup_journal(arg_directory) < 0)
4010                                 _exit(EXIT_FAILURE);
4011
4012                         if (mount_binds(arg_directory, arg_bind, false) < 0)
4013                                 _exit(EXIT_FAILURE);
4014
4015                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
4016                                 _exit(EXIT_FAILURE);
4017
4018                         if (mount_tmpfs(arg_directory) < 0)
4019                                 _exit(EXIT_FAILURE);
4020
4021                         /* Wait until we are cgroup-ified, so that we
4022                          * can mount the right cgroup path writable */
4023                         (void) barrier_sync_next(&barrier);
4024
4025                         if (mount_cgroup(arg_directory) < 0)
4026                                 _exit(EXIT_FAILURE);
4027
4028                         if (chdir(arg_directory) < 0) {
4029                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
4030                                 _exit(EXIT_FAILURE);
4031                         }
4032
4033                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
4034                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
4035                                 _exit(EXIT_FAILURE);
4036                         }
4037
4038                         if (chroot(".") < 0) {
4039                                 log_error_errno(errno, "chroot() failed: %m");
4040                                 _exit(EXIT_FAILURE);
4041                         }
4042
4043                         if (chdir("/") < 0) {
4044                                 log_error_errno(errno, "chdir() failed: %m");
4045                                 _exit(EXIT_FAILURE);
4046                         }
4047
4048                         umask(0022);
4049
4050                         if (arg_private_network)
4051                                 loopback_setup();
4052
4053                         if (drop_capabilities() < 0) {
4054                                 log_error_errno(errno, "drop_capabilities() failed: %m");
4055                                 _exit(EXIT_FAILURE);
4056                         }
4057
4058                         r = change_uid_gid(&home);
4059                         if (r < 0)
4060                                 _exit(EXIT_FAILURE);
4061
4062                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
4063                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
4064                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
4065                                 log_oom();
4066                                 _exit(EXIT_FAILURE);
4067                         }
4068
4069                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
4070                                 char as_uuid[37];
4071
4072                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
4073                                         log_oom();
4074                                         _exit(EXIT_FAILURE);
4075                                 }
4076                         }
4077
4078                         if (fdset_size(fds) > 0) {
4079                                 r = fdset_cloexec(fds, false);
4080                                 if (r < 0) {
4081                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
4082                                         _exit(EXIT_FAILURE);
4083                                 }
4084
4085                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
4086                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
4087                                         log_oom();
4088                                         _exit(EXIT_FAILURE);
4089                                 }
4090                         }
4091
4092                         setup_hostname();
4093
4094                         if (arg_personality != 0xffffffffLU) {
4095                                 if (personality(arg_personality) < 0) {
4096                                         log_error_errno(errno, "personality() failed: %m");
4097                                         _exit(EXIT_FAILURE);
4098                                 }
4099                         } else if (secondary) {
4100                                 if (personality(PER_LINUX32) < 0) {
4101                                         log_error_errno(errno, "personality() failed: %m");
4102                                         _exit(EXIT_FAILURE);
4103                                 }
4104                         }
4105
4106 #ifdef HAVE_SELINUX
4107                         if (arg_selinux_context)
4108                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
4109                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
4110                                         _exit(EXIT_FAILURE);
4111                                 }
4112 #endif
4113
4114                         if (!strv_isempty(arg_setenv)) {
4115                                 char **n;
4116
4117                                 n = strv_env_merge(2, envp, arg_setenv);
4118                                 if (!n) {
4119                                         log_oom();
4120                                         _exit(EXIT_FAILURE);
4121                                 }
4122
4123                                 env_use = n;
4124                         } else
4125                                 env_use = (char**) envp;
4126
4127                         /* Wait until the parent is ready with the setup, too... */
4128                         if (!barrier_place_and_sync(&barrier))
4129                                 _exit(EXIT_FAILURE);
4130
4131                         if (arg_boot) {
4132                                 char **a;
4133                                 size_t l;
4134
4135                                 /* Automatically search for the init system */
4136
4137                                 l = 1 + argc - optind;
4138                                 a = newa(char*, l + 1);
4139                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
4140
4141                                 a[0] = (char*) "/usr/lib/systemd/systemd";
4142                                 execve(a[0], a, env_use);
4143
4144                                 a[0] = (char*) "/lib/systemd/systemd";
4145                                 execve(a[0], a, env_use);
4146
4147                                 a[0] = (char*) "/sbin/init";
4148                                 execve(a[0], a, env_use);
4149                         } else if (argc > optind)
4150                                 execvpe(argv[optind], argv + optind, env_use);
4151                         else {
4152                                 chdir(home ? home : "/root");
4153                                 execle("/bin/bash", "-bash", NULL, env_use);
4154                                 execle("/bin/sh", "-sh", NULL, env_use);
4155                         }
4156
4157                         log_error_errno(errno, "execv() failed: %m");
4158                         _exit(EXIT_FAILURE);
4159                 }
4160
4161                 barrier_set_role(&barrier, BARRIER_PARENT);
4162                 fdset_free(fds);
4163                 fds = NULL;
4164
4165                 kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4166                 rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4167
4168                 /* Wait for the most basic Child-setup to be done,
4169                  * before we add hardware to it, and place it in a
4170                  * cgroup. */
4171                 if (barrier_sync_next(&barrier)) {
4172                         int ifi = 0;
4173
4174                         r = move_network_interfaces(pid);
4175                         if (r < 0)
4176                                 goto finish;
4177
4178                         r = setup_veth(pid, veth_name, &ifi);
4179                         if (r < 0)
4180                                 goto finish;
4181
4182                         r = setup_bridge(veth_name, &ifi);
4183                         if (r < 0)
4184                                 goto finish;
4185
4186                         r = setup_macvlan(pid);
4187                         if (r < 0)
4188                                 goto finish;
4189
4190                         r = setup_ipvlan(pid);
4191                         if (r < 0)
4192                                 goto finish;
4193
4194                         r = register_machine(pid, ifi);
4195                         if (r < 0)
4196                                 goto finish;
4197
4198                         /* Block SIGCHLD here, before notifying child.
4199                          * process_pty() will handle it with the other signals. */
4200                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
4201                         if (r < 0)
4202                                 goto finish;
4203
4204                         /* Reset signal to default */
4205                         r = default_signals(SIGCHLD, -1);
4206                         if (r < 0)
4207                                 goto finish;
4208
4209                         /* Notify the child that the parent is ready with all
4210                          * its setup, and that the child can now hand over
4211                          * control to the code to run inside the container. */
4212                         (void) barrier_place(&barrier);
4213
4214                         /* And wait until the child is completely ready now. */
4215                         if (barrier_place_and_sync(&barrier)) {
4216                                 _cleanup_event_unref_ sd_event *event = NULL;
4217                                 _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4218                                 _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
4219                                 char last_char = 0;
4220
4221                                 sd_notifyf(false,
4222                                            "READY=1\n"
4223                                            "STATUS=Container running.\n"
4224                                            "X_NSPAWN_LEADER_PID=" PID_FMT, pid);
4225
4226                                 r = sd_event_new(&event);
4227                                 if (r < 0) {
4228                                         log_error_errno(r, "Failed to get default event source: %m");
4229                                         goto finish;
4230                                 }
4231
4232                                 if (arg_boot) {
4233                                         /* Try to kill the init system on SIGINT or SIGTERM */
4234                                         sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
4235                                         sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
4236                                 } else {
4237                                         /* Immediately exit */
4238                                         sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
4239                                         sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
4240                                 }
4241
4242                                 /* simply exit on sigchld */
4243                                 sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
4244
4245                                 if (arg_expose_ports) {
4246                                         r = watch_rtnl(event, rtnl_socket_pair[0], &exposed, &rtnl);
4247                                         if (r < 0)
4248                                                 goto finish;
4249
4250                                         (void) expose_ports(rtnl, &exposed);
4251                                 }
4252
4253                                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4254
4255                                 r = pty_forward_new(event, master, true, &forward);
4256                                 if (r < 0) {
4257                                         log_error_errno(r, "Failed to create PTY forwarder: %m");
4258                                         goto finish;
4259                                 }
4260
4261                                 r = sd_event_loop(event);
4262                                 if (r < 0) {
4263                                         log_error_errno(r, "Failed to run event loop: %m");
4264                                         goto finish;
4265                                 }
4266
4267                                 pty_forward_get_last_char(forward, &last_char);
4268
4269                                 forward = pty_forward_free(forward);
4270
4271                                 if (!arg_quiet && last_char != '\n')
4272                                         putc('\n', stdout);
4273
4274                                 /* Kill if it is not dead yet anyway */
4275                                 terminate_machine(pid);
4276                         }
4277                 }
4278
4279                 /* Normally redundant, but better safe than sorry */
4280                 kill(pid, SIGKILL);
4281
4282                 r = wait_for_container(pid, &container_status);
4283                 pid = 0;
4284
4285                 if (r < 0)
4286                         /* We failed to wait for the container, or the
4287                          * container exited abnormally */
4288                         goto finish;
4289                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
4290                         /* The container exited with a non-zero
4291                          * status, or with zero status and no reboot
4292                          * was requested. */
4293                         ret = r;
4294                         break;
4295                 }
4296
4297                 /* CONTAINER_REBOOTED, loop again */
4298
4299                 if (arg_keep_unit) {
4300                         /* Special handling if we are running as a
4301                          * service: instead of simply restarting the
4302                          * machine we want to restart the entire
4303                          * service, so let's inform systemd about this
4304                          * with the special exit code 133. The service
4305                          * file uses RestartForceExitStatus=133 so
4306                          * that this results in a full nspawn
4307                          * restart. This is necessary since we might
4308                          * have cgroup parameters set we want to have
4309                          * flushed out. */
4310                         ret = 133;
4311                         r = 0;
4312                         break;
4313                 }
4314
4315                 flush_ports(&exposed);
4316         }
4317
4318 finish:
4319         sd_notify(false,
4320                   "STOPPING=1\n"
4321                   "STATUS=Terminating...");
4322
4323         loop_remove(loop_nr, &image_fd);
4324
4325         if (pid > 0)
4326                 kill(pid, SIGKILL);
4327
4328         if (remove_subvol && arg_directory) {
4329                 int k;
4330
4331                 k = btrfs_subvol_remove(arg_directory);
4332                 if (k < 0)
4333                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
4334         }
4335
4336         if (arg_machine) {
4337                 const char *p;
4338
4339                 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
4340                 (void) rm_rf(p, false, true, false);
4341         }
4342
4343         free(arg_directory);
4344         free(arg_template);
4345         free(arg_image);
4346         free(arg_machine);
4347         free(arg_user);
4348         strv_free(arg_setenv);
4349         strv_free(arg_network_interfaces);
4350         strv_free(arg_network_macvlan);
4351         strv_free(arg_network_ipvlan);
4352         strv_free(arg_bind);
4353         strv_free(arg_bind_ro);
4354         strv_free(arg_tmpfs);
4355
4356         flush_ports(&exposed);
4357
4358         while (arg_expose_ports) {
4359                 ExposePort *p = arg_expose_ports;
4360                 LIST_REMOVE(ports, arg_expose_ports, p);
4361                 free(p);
4362         }
4363
4364         return r < 0 ? EXIT_FAILURE : ret;
4365 }