chiark / gitweb /
nspawn: use the same image discovery logic in nspawn as in machined
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46
47 #ifdef HAVE_SELINUX
48 #include <selinux/selinux.h>
49 #endif
50
51 #ifdef HAVE_SECCOMP
52 #include <seccomp.h>
53 #endif
54
55 #ifdef HAVE_BLKID
56 #include <blkid/blkid.h>
57 #endif
58
59 #include "sd-daemon.h"
60 #include "sd-bus.h"
61 #include "sd-id128.h"
62 #include "sd-rtnl.h"
63 #include "log.h"
64 #include "util.h"
65 #include "mkdir.h"
66 #include "macro.h"
67 #include "audit.h"
68 #include "missing.h"
69 #include "cgroup-util.h"
70 #include "strv.h"
71 #include "path-util.h"
72 #include "loopback-setup.h"
73 #include "dev-setup.h"
74 #include "fdset.h"
75 #include "build.h"
76 #include "fileio.h"
77 #include "bus-util.h"
78 #include "bus-error.h"
79 #include "ptyfwd.h"
80 #include "bus-kernel.h"
81 #include "env-util.h"
82 #include "def.h"
83 #include "rtnl-util.h"
84 #include "udev-util.h"
85 #include "blkid-util.h"
86 #include "gpt.h"
87 #include "siphash24.h"
88 #include "copy.h"
89 #include "base-filesystem.h"
90 #include "barrier.h"
91 #include "event-util.h"
92 #include "capability.h"
93 #include "cap-list.h"
94 #include "btrfs-util.h"
95 #include "machine-image.h"
96
97 #ifdef HAVE_SECCOMP
98 #include "seccomp-util.h"
99 #endif
100
/* Outcome of a container run; the names suggest "terminated for good"
 * versus "asked for a reboot" -- NOTE(review): confirm against the
 * code that consumes this enum, which is not part of this section. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
105
/* Journal linking mode, selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto, also the built-in default */
        LINK_HOST  = 2,  /* --link-journal=host / try-host */
        LINK_GUEST = 3   /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
112
/* Volatile mode, selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,  /* default, or --volatile=<false> */
        VOLATILE_YES   = 1,  /* bare --volatile, or --volatile=<true> */
        VOLATILE_STATE = 2,  /* --volatile=state */
} Volatile;
118
119 static char *arg_directory = NULL;
120 static char *arg_template = NULL;
121 static char *arg_user = NULL;
122 static sd_id128_t arg_uuid = {};
123 static char *arg_machine = NULL;
124 static const char *arg_selinux_context = NULL;
125 static const char *arg_selinux_apifs_context = NULL;
126 static const char *arg_slice = NULL;
127 static bool arg_private_network = false;
128 static bool arg_read_only = false;
129 static bool arg_boot = false;
130 static bool arg_ephemeral = false;
131 static LinkJournal arg_link_journal = LINK_AUTO;
132 static bool arg_link_journal_try = false;
133 static uint64_t arg_retain =
134         (1ULL << CAP_CHOWN) |
135         (1ULL << CAP_DAC_OVERRIDE) |
136         (1ULL << CAP_DAC_READ_SEARCH) |
137         (1ULL << CAP_FOWNER) |
138         (1ULL << CAP_FSETID) |
139         (1ULL << CAP_IPC_OWNER) |
140         (1ULL << CAP_KILL) |
141         (1ULL << CAP_LEASE) |
142         (1ULL << CAP_LINUX_IMMUTABLE) |
143         (1ULL << CAP_NET_BIND_SERVICE) |
144         (1ULL << CAP_NET_BROADCAST) |
145         (1ULL << CAP_NET_RAW) |
146         (1ULL << CAP_SETGID) |
147         (1ULL << CAP_SETFCAP) |
148         (1ULL << CAP_SETPCAP) |
149         (1ULL << CAP_SETUID) |
150         (1ULL << CAP_SYS_ADMIN) |
151         (1ULL << CAP_SYS_CHROOT) |
152         (1ULL << CAP_SYS_NICE) |
153         (1ULL << CAP_SYS_PTRACE) |
154         (1ULL << CAP_SYS_TTY_CONFIG) |
155         (1ULL << CAP_SYS_RESOURCE) |
156         (1ULL << CAP_SYS_BOOT) |
157         (1ULL << CAP_AUDIT_WRITE) |
158         (1ULL << CAP_AUDIT_CONTROL) |
159         (1ULL << CAP_MKNOD);
160 static char **arg_bind = NULL;
161 static char **arg_bind_ro = NULL;
162 static char **arg_tmpfs = NULL;
163 static char **arg_setenv = NULL;
164 static bool arg_quiet = false;
165 static bool arg_share_system = false;
166 static bool arg_register = true;
167 static bool arg_keep_unit = false;
168 static char **arg_network_interfaces = NULL;
169 static char **arg_network_macvlan = NULL;
170 static bool arg_network_veth = false;
171 static const char *arg_network_bridge = NULL;
172 static unsigned long arg_personality = 0xffffffffLU;
173 static char *arg_image = NULL;
174 static Volatile arg_volatile = VOLATILE_NO;
175
/* Print the usage summary to stdout. The program name is taken from
 * program_invocation_short_name. Keep this list in sync with the option
 * table in parse_argv(). */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "     --template=PATH        Initialize root directory from template directory,\n"
               "                            if missing\n"
               "  -x --ephemeral            Run container with snapshot of root directory, and\n"
               "                            remove it after exit\n"
               "  -i --image=PATH           File system device or disk image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
               "                            try-guest, try-host\n"
               "  -j                        Equivalent to --link-journal=try-guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               /* --personality= is accepted by parse_argv() and therefore listed here too */
               "     --personality=ARCH     Pick personality for this container\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n",
               program_invocation_short_name);
}
231
/* Replace the string stored in *b with a cleaned-up absolute form of path.
 *
 * The path is resolved with canonicalize_file_name(). If that fails with
 * ENOENT the path is instead made absolute relative to the current working
 * directory; any other resolution error is returned as a negative errno.
 * The previous value of *b is freed.
 *
 * Returns 0 on success, -ENOMEM if the fallback allocation fails. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Only a missing path is tolerated; everything else is a hard error */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
252
253 static int parse_argv(int argc, char *argv[]) {
254
255         enum {
256                 ARG_VERSION = 0x100,
257                 ARG_PRIVATE_NETWORK,
258                 ARG_UUID,
259                 ARG_READ_ONLY,
260                 ARG_CAPABILITY,
261                 ARG_DROP_CAPABILITY,
262                 ARG_LINK_JOURNAL,
263                 ARG_BIND,
264                 ARG_BIND_RO,
265                 ARG_TMPFS,
266                 ARG_SETENV,
267                 ARG_SHARE_SYSTEM,
268                 ARG_REGISTER,
269                 ARG_KEEP_UNIT,
270                 ARG_NETWORK_INTERFACE,
271                 ARG_NETWORK_MACVLAN,
272                 ARG_NETWORK_VETH,
273                 ARG_NETWORK_BRIDGE,
274                 ARG_PERSONALITY,
275                 ARG_VOLATILE,
276                 ARG_TEMPLATE,
277         };
278
279         static const struct option options[] = {
280                 { "help",                  no_argument,       NULL, 'h'                   },
281                 { "version",               no_argument,       NULL, ARG_VERSION           },
282                 { "directory",             required_argument, NULL, 'D'                   },
283                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
284                 { "ephemeral",             no_argument,       NULL, 'x'                   },
285                 { "user",                  required_argument, NULL, 'u'                   },
286                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
287                 { "boot",                  no_argument,       NULL, 'b'                   },
288                 { "uuid",                  required_argument, NULL, ARG_UUID              },
289                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
290                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
291                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
292                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
293                 { "bind",                  required_argument, NULL, ARG_BIND              },
294                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
295                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
296                 { "machine",               required_argument, NULL, 'M'                   },
297                 { "slice",                 required_argument, NULL, 'S'                   },
298                 { "setenv",                required_argument, NULL, ARG_SETENV            },
299                 { "selinux-context",       required_argument, NULL, 'Z'                   },
300                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
301                 { "quiet",                 no_argument,       NULL, 'q'                   },
302                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
303                 { "register",              required_argument, NULL, ARG_REGISTER          },
304                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
305                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
306                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
307                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
308                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
309                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
310                 { "image",                 required_argument, NULL, 'i'                   },
311                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
312                 {}
313         };
314
315         int c, r;
316         uint64_t plus = 0, minus = 0;
317
318         assert(argc >= 0);
319         assert(argv);
320
321         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:x", options, NULL)) >= 0)
322
323                 switch (c) {
324
325                 case 'h':
326                         help();
327                         return 0;
328
329                 case ARG_VERSION:
330                         puts(PACKAGE_STRING);
331                         puts(SYSTEMD_FEATURES);
332                         return 0;
333
334                 case 'D':
335                         r = set_sanitized_path(&arg_directory, optarg);
336                         if (r < 0)
337                                 return log_error_errno(r, "Invalid root directory: %m");
338
339                         break;
340
341                 case ARG_TEMPLATE:
342                         r = set_sanitized_path(&arg_template, optarg);
343                         if (r < 0)
344                                 return log_error_errno(r, "Invalid template directory: %m");
345
346                         break;
347
348                 case 'i':
349                         r = set_sanitized_path(&arg_image, optarg);
350                         if (r < 0)
351                                 return log_error_errno(r, "Invalid image path: %m");
352
353                         break;
354
355                 case 'x':
356                         arg_ephemeral = true;
357                         break;
358
359                 case 'u':
360                         free(arg_user);
361                         arg_user = strdup(optarg);
362                         if (!arg_user)
363                                 return log_oom();
364
365                         break;
366
367                 case ARG_NETWORK_BRIDGE:
368                         arg_network_bridge = optarg;
369
370                         /* fall through */
371
372                 case ARG_NETWORK_VETH:
373                         arg_network_veth = true;
374                         arg_private_network = true;
375                         break;
376
377                 case ARG_NETWORK_INTERFACE:
378                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
379                                 return log_oom();
380
381                         arg_private_network = true;
382                         break;
383
384                 case ARG_NETWORK_MACVLAN:
385                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
386                                 return log_oom();
387
388                         /* fall through */
389
390                 case ARG_PRIVATE_NETWORK:
391                         arg_private_network = true;
392                         break;
393
394                 case 'b':
395                         arg_boot = true;
396                         break;
397
398                 case ARG_UUID:
399                         r = sd_id128_from_string(optarg, &arg_uuid);
400                         if (r < 0) {
401                                 log_error("Invalid UUID: %s", optarg);
402                                 return r;
403                         }
404                         break;
405
406                 case 'S':
407                         arg_slice = optarg;
408                         break;
409
410                 case 'M':
411                         if (isempty(optarg)) {
412                                 free(arg_machine);
413                                 arg_machine = NULL;
414                         } else {
415                                 if (!machine_name_is_valid(optarg)) {
416                                         log_error("Invalid machine name: %s", optarg);
417                                         return -EINVAL;
418                                 }
419
420                                 r = free_and_strdup(&arg_machine, optarg);
421                                 if (r < 0)
422                                         return log_oom();
423
424                                 break;
425                         }
426
427                 case 'Z':
428                         arg_selinux_context = optarg;
429                         break;
430
431                 case 'L':
432                         arg_selinux_apifs_context = optarg;
433                         break;
434
435                 case ARG_READ_ONLY:
436                         arg_read_only = true;
437                         break;
438
439                 case ARG_CAPABILITY:
440                 case ARG_DROP_CAPABILITY: {
441                         const char *state, *word;
442                         size_t length;
443
444                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
445                                 _cleanup_free_ char *t;
446
447                                 t = strndup(word, length);
448                                 if (!t)
449                                         return log_oom();
450
451                                 if (streq(t, "all")) {
452                                         if (c == ARG_CAPABILITY)
453                                                 plus = (uint64_t) -1;
454                                         else
455                                                 minus = (uint64_t) -1;
456                                 } else {
457                                         int cap;
458
459                                         cap = capability_from_name(t);
460                                         if (cap < 0) {
461                                                 log_error("Failed to parse capability %s.", t);
462                                                 return -EINVAL;
463                                         }
464
465                                         if (c == ARG_CAPABILITY)
466                                                 plus |= 1ULL << (uint64_t) cap;
467                                         else
468                                                 minus |= 1ULL << (uint64_t) cap;
469                                 }
470                         }
471
472                         break;
473                 }
474
475                 case 'j':
476                         arg_link_journal = LINK_GUEST;
477                         arg_link_journal_try = true;
478                         break;
479
480                 case ARG_LINK_JOURNAL:
481                         if (streq(optarg, "auto")) {
482                                 arg_link_journal = LINK_AUTO;
483                                 arg_link_journal_try = false;
484                         } else if (streq(optarg, "no")) {
485                                 arg_link_journal = LINK_NO;
486                                 arg_link_journal_try = false;
487                         } else if (streq(optarg, "guest")) {
488                                 arg_link_journal = LINK_GUEST;
489                                 arg_link_journal_try = false;
490                         } else if (streq(optarg, "host")) {
491                                 arg_link_journal = LINK_HOST;
492                                 arg_link_journal_try = false;
493                         } else if (streq(optarg, "try-guest")) {
494                                 arg_link_journal = LINK_GUEST;
495                                 arg_link_journal_try = true;
496                         } else if (streq(optarg, "try-host")) {
497                                 arg_link_journal = LINK_HOST;
498                                 arg_link_journal_try = true;
499                         } else {
500                                 log_error("Failed to parse link journal mode %s", optarg);
501                                 return -EINVAL;
502                         }
503
504                         break;
505
506                 case ARG_BIND:
507                 case ARG_BIND_RO: {
508                         _cleanup_free_ char *a = NULL, *b = NULL;
509                         char *e;
510                         char ***x;
511
512                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
513
514                         e = strchr(optarg, ':');
515                         if (e) {
516                                 a = strndup(optarg, e - optarg);
517                                 b = strdup(e + 1);
518                         } else {
519                                 a = strdup(optarg);
520                                 b = strdup(optarg);
521                         }
522
523                         if (!a || !b)
524                                 return log_oom();
525
526                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
527                                 log_error("Invalid bind mount specification: %s", optarg);
528                                 return -EINVAL;
529                         }
530
531                         r = strv_extend(x, a);
532                         if (r < 0)
533                                 return log_oom();
534
535                         r = strv_extend(x, b);
536                         if (r < 0)
537                                 return log_oom();
538
539                         break;
540                 }
541
542                 case ARG_TMPFS: {
543                         _cleanup_free_ char *a = NULL, *b = NULL;
544                         char *e;
545
546                         e = strchr(optarg, ':');
547                         if (e) {
548                                 a = strndup(optarg, e - optarg);
549                                 b = strdup(e + 1);
550                         } else {
551                                 a = strdup(optarg);
552                                 b = strdup("mode=0755");
553                         }
554
555                         if (!a || !b)
556                                 return log_oom();
557
558                         if (!path_is_absolute(a)) {
559                                 log_error("Invalid tmpfs specification: %s", optarg);
560                                 return -EINVAL;
561                         }
562
563                         r = strv_push(&arg_tmpfs, a);
564                         if (r < 0)
565                                 return log_oom();
566
567                         a = NULL;
568
569                         r = strv_push(&arg_tmpfs, b);
570                         if (r < 0)
571                                 return log_oom();
572
573                         b = NULL;
574
575                         break;
576                 }
577
578                 case ARG_SETENV: {
579                         char **n;
580
581                         if (!env_assignment_is_valid(optarg)) {
582                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
583                                 return -EINVAL;
584                         }
585
586                         n = strv_env_set(arg_setenv, optarg);
587                         if (!n)
588                                 return log_oom();
589
590                         strv_free(arg_setenv);
591                         arg_setenv = n;
592                         break;
593                 }
594
595                 case 'q':
596                         arg_quiet = true;
597                         break;
598
599                 case ARG_SHARE_SYSTEM:
600                         arg_share_system = true;
601                         break;
602
603                 case ARG_REGISTER:
604                         r = parse_boolean(optarg);
605                         if (r < 0) {
606                                 log_error("Failed to parse --register= argument: %s", optarg);
607                                 return r;
608                         }
609
610                         arg_register = r;
611                         break;
612
613                 case ARG_KEEP_UNIT:
614                         arg_keep_unit = true;
615                         break;
616
617                 case ARG_PERSONALITY:
618
619                         arg_personality = personality_from_string(optarg);
620                         if (arg_personality == 0xffffffffLU) {
621                                 log_error("Unknown or unsupported personality '%s'.", optarg);
622                                 return -EINVAL;
623                         }
624
625                         break;
626
627                 case ARG_VOLATILE:
628
629                         if (!optarg)
630                                 arg_volatile = VOLATILE_YES;
631                         else {
632                                 r = parse_boolean(optarg);
633                                 if (r < 0) {
634                                         if (streq(optarg, "state"))
635                                                 arg_volatile = VOLATILE_STATE;
636                                         else {
637                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
638                                                 return r;
639                                         }
640                                 } else
641                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
642                         }
643
644                         break;
645
646                 case '?':
647                         return -EINVAL;
648
649                 default:
650                         assert_not_reached("Unhandled option");
651                 }
652
653         if (arg_share_system)
654                 arg_register = false;
655
656         if (arg_boot && arg_share_system) {
657                 log_error("--boot and --share-system may not be combined.");
658                 return -EINVAL;
659         }
660
661         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
662                 log_error("--keep-unit may not be used when invoked from a user session.");
663                 return -EINVAL;
664         }
665
666         if (arg_directory && arg_image) {
667                 log_error("--directory= and --image= may not be combined.");
668                 return -EINVAL;
669         }
670
671         if (arg_template && arg_image) {
672                 log_error("--template= and --image= may not be combined.");
673                 return -EINVAL;
674         }
675
676         if (arg_template && !(arg_directory || arg_machine)) {
677                 log_error("--template= needs --directory= or --machine=.");
678                 return -EINVAL;
679         }
680
681         if (arg_ephemeral && arg_template) {
682                 log_error("--ephemeral and --template= may not be combined.");
683                 return -EINVAL;
684         }
685
686         if (arg_ephemeral && arg_image) {
687                 log_error("--ephemeral and --image= may not be combined.");
688                 return -EINVAL;
689         }
690
691         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
692                 log_error("--ephemeral and --link-journal= may not be combined.");
693                 return -EINVAL;
694         }
695
696         if (arg_volatile != VOLATILE_NO && arg_read_only) {
697                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
698                 return -EINVAL;
699         }
700
701         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
702
703         return 1;
704 }
705
706 static int mount_all(const char *dest) {
707
708         typedef struct MountPoint {
709                 const char *what;
710                 const char *where;
711                 const char *type;
712                 const char *options;
713                 unsigned long flags;
714                 bool fatal;
715         } MountPoint;
716
717         static const MountPoint mount_table[] = {
718                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
719                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
720                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
721                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
722                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
723                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
724                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
725                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
726 #ifdef HAVE_SELINUX
727                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
728                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
729 #endif
730         };
731
732         unsigned k;
733         int r = 0;
734
735         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
736                 _cleanup_free_ char *where = NULL;
737 #ifdef HAVE_SELINUX
738                 _cleanup_free_ char *options = NULL;
739 #endif
740                 const char *o;
741                 int t;
742
743                 where = strjoin(dest, "/", mount_table[k].where, NULL);
744                 if (!where)
745                         return log_oom();
746
747                 t = path_is_mount_point(where, true);
748                 if (t < 0) {
749                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
750
751                         if (r == 0)
752                                 r = t;
753
754                         continue;
755                 }
756
757                 /* Skip this entry if it is not a remount. */
758                 if (mount_table[k].what && t > 0)
759                         continue;
760
761                 t = mkdir_p(where, 0755);
762                 if (t < 0) {
763                         if (mount_table[k].fatal) {
764                                log_error_errno(t, "Failed to create directory %s: %m", where);
765
766                                 if (r == 0)
767                                         r = t;
768                         } else
769                                log_warning_errno(t, "Failed to create directory %s: %m", where);
770
771                         continue;
772                 }
773
774 #ifdef HAVE_SELINUX
775                 if (arg_selinux_apifs_context &&
776                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
777                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
778                         if (!options)
779                                 return log_oom();
780
781                         o = options;
782                 } else
783 #endif
784                         o = mount_table[k].options;
785
786
787                 if (mount(mount_table[k].what,
788                           where,
789                           mount_table[k].type,
790                           mount_table[k].flags,
791                           o) < 0) {
792
793                         if (mount_table[k].fatal) {
794                                 log_error_errno(errno, "mount(%s) failed: %m", where);
795
796                                 if (r == 0)
797                                         r = -errno;
798                         } else
799                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
800                 }
801         }
802
803         return r;
804 }
805
806 static int mount_binds(const char *dest, char **l, bool ro) {
807         char **x, **y;
808
809         STRV_FOREACH_PAIR(x, y, l) {
810                 _cleanup_free_ char *where = NULL;
811                 struct stat source_st, dest_st;
812                 int r;
813
814                 if (stat(*x, &source_st) < 0)
815                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
816
817                 where = strappend(dest, *y);
818                 if (!where)
819                         return log_oom();
820
821                 r = stat(where, &dest_st);
822                 if (r == 0) {
823                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
824                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
825                                 return -EINVAL;
826                         }
827                 } else if (errno == ENOENT) {
828                         r = mkdir_parents_label(where, 0755);
829                         if (r < 0)
830                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
831                 } else {
832                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
833                         return -errno;
834                 }
835
836                 /* Create the mount point, but be conservative -- refuse to create block
837                  * and char devices. */
838                 if (S_ISDIR(source_st.st_mode)) {
839                         r = mkdir_label(where, 0755);
840                         if (r < 0 && errno != EEXIST)
841                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
842                 } else if (S_ISFIFO(source_st.st_mode)) {
843                         r = mkfifo(where, 0644);
844                         if (r < 0 && errno != EEXIST)
845                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
846                 } else if (S_ISSOCK(source_st.st_mode)) {
847                         r = mknod(where, 0644 | S_IFSOCK, 0);
848                         if (r < 0 && errno != EEXIST)
849                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
850                 } else if (S_ISREG(source_st.st_mode)) {
851                         r = touch(where);
852                         if (r < 0)
853                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
854                 } else {
855                         log_error("Refusing to create mountpoint for file: %s", *x);
856                         return -ENOTSUP;
857                 }
858
859                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
860                         return log_error_errno(errno, "mount(%s) failed: %m", where);
861
862                 if (ro) {
863                         r = bind_remount_recursive(where, true);
864                         if (r < 0)
865                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
866                 }
867         }
868
869         return 0;
870 }
871
872 static int mount_tmpfs(const char *dest) {
873         char **i, **o;
874
875         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
876                 _cleanup_free_ char *where = NULL;
877                 int r;
878
879                 where = strappend(dest, *i);
880                 if (!where)
881                         return log_oom();
882
883                 r = mkdir_label(where, 0755);
884                 if (r < 0 && r != -EEXIST)
885                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
886
887                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
888                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
889         }
890
891         return 0;
892 }
893
894 static int setup_timezone(const char *dest) {
895         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
896         char *z, *y;
897         int r;
898
899         assert(dest);
900
901         /* Fix the timezone, if possible */
902         r = readlink_malloc("/etc/localtime", &p);
903         if (r < 0) {
904                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
905                 return 0;
906         }
907
908         z = path_startswith(p, "../usr/share/zoneinfo/");
909         if (!z)
910                 z = path_startswith(p, "/usr/share/zoneinfo/");
911         if (!z) {
912                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
913                 return 0;
914         }
915
916         where = strappend(dest, "/etc/localtime");
917         if (!where)
918                 return log_oom();
919
920         r = readlink_malloc(where, &q);
921         if (r >= 0) {
922                 y = path_startswith(q, "../usr/share/zoneinfo/");
923                 if (!y)
924                         y = path_startswith(q, "/usr/share/zoneinfo/");
925
926                 /* Already pointing to the right place? Then do nothing .. */
927                 if (y && streq(y, z))
928                         return 0;
929         }
930
931         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
932         if (!check)
933                 return log_oom();
934
935         if (access(check, F_OK) < 0) {
936                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
937                 return 0;
938         }
939
940         what = strappend("../usr/share/zoneinfo/", z);
941         if (!what)
942                 return log_oom();
943
944         r = mkdir_parents(where, 0755);
945         if (r < 0) {
946                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
947
948                 return 0;
949         }
950
951         r = unlink(where);
952         if (r < 0 && errno != ENOENT) {
953                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
954
955                 return 0;
956         }
957
958         if (symlink(what, where) < 0) {
959                 log_error_errno(errno, "Failed to correct timezone of container: %m");
960                 return 0;
961         }
962
963         return 0;
964 }
965
966 static int setup_resolv_conf(const char *dest) {
967         _cleanup_free_ char *where = NULL;
968         int r;
969
970         assert(dest);
971
972         if (arg_private_network)
973                 return 0;
974
975         /* Fix resolv.conf, if possible */
976         where = strappend(dest, "/etc/resolv.conf");
977         if (!where)
978                 return log_oom();
979
980         /* We don't really care for the results of this really. If it
981          * fails, it fails, but meh... */
982         r = mkdir_parents(where, 0755);
983         if (r < 0) {
984                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
985
986                 return 0;
987         }
988
989         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
990         if (r < 0) {
991                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
992
993                 return 0;
994         }
995
996         return 0;
997 }
998
999 static int setup_volatile_state(const char *directory) {
1000         const char *p;
1001         int r;
1002
1003         assert(directory);
1004
1005         if (arg_volatile != VOLATILE_STATE)
1006                 return 0;
1007
1008         /* --volatile=state means we simply overmount /var
1009            with a tmpfs, and the rest read-only. */
1010
1011         r = bind_remount_recursive(directory, true);
1012         if (r < 0)
1013                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1014
1015         p = strappenda(directory, "/var");
1016         r = mkdir(p, 0755);
1017         if (r < 0 && errno != EEXIST)
1018                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1019
1020         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1021                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1022
1023         return 0;
1024 }
1025
1026 static int setup_volatile(const char *directory) {
1027         bool tmpfs_mounted = false, bind_mounted = false;
1028         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1029         const char *f, *t;
1030         int r;
1031
1032         assert(directory);
1033
1034         if (arg_volatile != VOLATILE_YES)
1035                 return 0;
1036
1037         /* --volatile=yes means we mount a tmpfs to the root dir, and
1038            the original /usr to use inside it, and that read-only. */
1039
1040         if (!mkdtemp(template))
1041                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1042
1043         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1044                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1045                 r = -errno;
1046                 goto fail;
1047         }
1048
1049         tmpfs_mounted = true;
1050
1051         f = strappenda(directory, "/usr");
1052         t = strappenda(template, "/usr");
1053
1054         r = mkdir(t, 0755);
1055         if (r < 0 && errno != EEXIST) {
1056                 log_error_errno(errno, "Failed to create %s: %m", t);
1057                 r = -errno;
1058                 goto fail;
1059         }
1060
1061         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1062                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1063                 r = -errno;
1064                 goto fail;
1065         }
1066
1067         bind_mounted = true;
1068
1069         r = bind_remount_recursive(t, true);
1070         if (r < 0) {
1071                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1072                 goto fail;
1073         }
1074
1075         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1076                 log_error_errno(errno, "Failed to move root mount: %m");
1077                 r = -errno;
1078                 goto fail;
1079         }
1080
1081         rmdir(template);
1082
1083         return 0;
1084
1085 fail:
1086         if (bind_mounted)
1087                 umount(t);
1088         if (tmpfs_mounted)
1089                 umount(template);
1090         rmdir(template);
1091         return r;
1092 }
1093
1094 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1095
1096         snprintf(s, 37,
1097                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1098                  SD_ID128_FORMAT_VAL(id));
1099
1100         return s;
1101 }
1102
1103 static int setup_boot_id(const char *dest) {
1104         _cleanup_free_ char *from = NULL, *to = NULL;
1105         sd_id128_t rnd = {};
1106         char as_uuid[37];
1107         int r;
1108
1109         assert(dest);
1110
1111         if (arg_share_system)
1112                 return 0;
1113
1114         /* Generate a new randomized boot ID, so that each boot-up of
1115          * the container gets a new one */
1116
1117         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1118         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1119         if (!from || !to)
1120                 return log_oom();
1121
1122         r = sd_id128_randomize(&rnd);
1123         if (r < 0)
1124                 return log_error_errno(r, "Failed to generate random boot id: %m");
1125
1126         id128_format_as_uuid(rnd, as_uuid);
1127
1128         r = write_string_file(from, as_uuid);
1129         if (r < 0)
1130                 return log_error_errno(r, "Failed to write boot id: %m");
1131
1132         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1133                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1134                 r = -errno;
1135         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1136                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1137
1138         unlink(from);
1139         return r;
1140 }
1141
1142 static int copy_devnodes(const char *dest) {
1143
1144         static const char devnodes[] =
1145                 "null\0"
1146                 "zero\0"
1147                 "full\0"
1148                 "random\0"
1149                 "urandom\0"
1150                 "tty\0"
1151                 "net/tun\0";
1152
1153         const char *d;
1154         int r = 0;
1155         _cleanup_umask_ mode_t u;
1156
1157         assert(dest);
1158
1159         u = umask(0000);
1160
1161         NULSTR_FOREACH(d, devnodes) {
1162                 _cleanup_free_ char *from = NULL, *to = NULL;
1163                 struct stat st;
1164
1165                 from = strappend("/dev/", d);
1166                 to = strjoin(dest, "/dev/", d, NULL);
1167                 if (!from || !to)
1168                         return log_oom();
1169
1170                 if (stat(from, &st) < 0) {
1171
1172                         if (errno != ENOENT)
1173                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1174
1175                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1176
1177                         log_error("%s is not a char or block device, cannot copy", from);
1178                         return -EIO;
1179
1180                 } else {
1181                         r = mkdir_parents(to, 0775);
1182                         if (r < 0) {
1183                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1184                                 return -r;
1185                         }
1186
1187                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1188                                 return log_error_errno(errno, "mknod(%s) failed: %m", dest);
1189                 }
1190         }
1191
1192         return r;
1193 }
1194
1195 static int setup_ptmx(const char *dest) {
1196         _cleanup_free_ char *p = NULL;
1197
1198         p = strappend(dest, "/dev/ptmx");
1199         if (!p)
1200                 return log_oom();
1201
1202         if (symlink("pts/ptmx", p) < 0)
1203                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1204
1205         return 0;
1206 }
1207
1208 static int setup_dev_console(const char *dest, const char *console) {
1209         _cleanup_umask_ mode_t u;
1210         const char *to;
1211         struct stat st;
1212         int r;
1213
1214         assert(dest);
1215         assert(console);
1216
1217         u = umask(0000);
1218
1219         if (stat("/dev/null", &st) < 0)
1220                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1221
1222         r = chmod_and_chown(console, 0600, 0, 0);
1223         if (r < 0)
1224                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1225
1226         /* We need to bind mount the right tty to /dev/console since
1227          * ptys can only exist on pts file systems. To have something
1228          * to bind mount things on we create a device node first, and
1229          * use /dev/null for that since we the cgroups device policy
1230          * allows us to create that freely, while we cannot create
1231          * /dev/console. (Note that the major minor doesn't actually
1232          * matter here, since we mount it over anyway). */
1233
1234         to = strappenda(dest, "/dev/console");
1235         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1236                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1237
1238         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1239                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1240
1241         return 0;
1242 }
1243
1244 static int setup_kmsg(const char *dest, int kmsg_socket) {
1245         _cleanup_free_ char *from = NULL, *to = NULL;
1246         int r, fd, k;
1247         _cleanup_umask_ mode_t u;
1248         union {
1249                 struct cmsghdr cmsghdr;
1250                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1251         } control = {};
1252         struct msghdr mh = {
1253                 .msg_control = &control,
1254                 .msg_controllen = sizeof(control),
1255         };
1256         struct cmsghdr *cmsg;
1257
1258         assert(dest);
1259         assert(kmsg_socket >= 0);
1260
1261         u = umask(0000);
1262
1263         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1264          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1265          * on the reading side behave very similar to /proc/kmsg,
1266          * their writing side behaves differently from /dev/kmsg in
1267          * that writing blocks when nothing is reading. In order to
1268          * avoid any problems with containers deadlocking due to this
1269          * we simply make /dev/kmsg unavailable to the container. */
1270         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1271             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1272                 return log_oom();
1273
1274         if (mkfifo(from, 0600) < 0)
1275                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1276
1277         r = chmod_and_chown(from, 0600, 0, 0);
1278         if (r < 0)
1279                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1280
1281         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1282                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1283
1284         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1285         if (fd < 0)
1286                 return log_error_errno(errno, "Failed to open fifo: %m");
1287
1288         cmsg = CMSG_FIRSTHDR(&mh);
1289         cmsg->cmsg_level = SOL_SOCKET;
1290         cmsg->cmsg_type = SCM_RIGHTS;
1291         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1292         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1293
1294         mh.msg_controllen = cmsg->cmsg_len;
1295
1296         /* Store away the fd in the socket, so that it stays open as
1297          * long as we run the child */
1298         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1299         safe_close(fd);
1300
1301         if (k < 0)
1302                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1303
1304         /* And now make the FIFO unavailable as /dev/kmsg... */
1305         unlink(from);
1306         return 0;
1307 }
1308
1309 static int setup_hostname(void) {
1310
1311         if (arg_share_system)
1312                 return 0;
1313
1314         if (sethostname_idempotent(arg_machine) < 0)
1315                 return -errno;
1316
1317         return 0;
1318 }
1319
1320 static int setup_journal(const char *directory) {
1321         sd_id128_t machine_id, this_id;
1322         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1323         char *id;
1324         int r;
1325
1326         /* Don't link journals in ephemeral mode */
1327         if (arg_ephemeral)
1328                 return 0;
1329
1330         p = strappend(directory, "/etc/machine-id");
1331         if (!p)
1332                 return log_oom();
1333
1334         r = read_one_line_file(p, &b);
1335         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1336                 return 0;
1337         else if (r < 0)
1338                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1339
1340         id = strstrip(b);
1341         if (isempty(id) && arg_link_journal == LINK_AUTO)
1342                 return 0;
1343
1344         /* Verify validity */
1345         r = sd_id128_from_string(id, &machine_id);
1346         if (r < 0)
1347                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1348
1349         r = sd_id128_get_machine(&this_id);
1350         if (r < 0)
1351                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1352
1353         if (sd_id128_equal(machine_id, this_id)) {
1354                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1355                          "Host and machine ids are equal (%s): refusing to link journals", id);
1356                 if (arg_link_journal == LINK_AUTO)
1357                         return 0;
1358                 return -EEXIST;
1359         }
1360
1361         if (arg_link_journal == LINK_NO)
1362                 return 0;
1363
1364         free(p);
1365         p = strappend("/var/log/journal/", id);
1366         q = strjoin(directory, "/var/log/journal/", id, NULL);
1367         if (!p || !q)
1368                 return log_oom();
1369
1370         if (path_is_mount_point(p, false) > 0) {
1371                 if (arg_link_journal != LINK_AUTO) {
1372                         log_error("%s: already a mount point, refusing to use for journal", p);
1373                         return -EEXIST;
1374                 }
1375
1376                 return 0;
1377         }
1378
1379         if (path_is_mount_point(q, false) > 0) {
1380                 if (arg_link_journal != LINK_AUTO) {
1381                         log_error("%s: already a mount point, refusing to use for journal", q);
1382                         return -EEXIST;
1383                 }
1384
1385                 return 0;
1386         }
1387
1388         r = readlink_and_make_absolute(p, &d);
1389         if (r >= 0) {
1390                 if ((arg_link_journal == LINK_GUEST ||
1391                      arg_link_journal == LINK_AUTO) &&
1392                     path_equal(d, q)) {
1393
1394                         r = mkdir_p(q, 0755);
1395                         if (r < 0)
1396                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1397                         return 0;
1398                 }
1399
1400                 if (unlink(p) < 0)
1401                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1402         } else if (r == -EINVAL) {
1403
1404                 if (arg_link_journal == LINK_GUEST &&
1405                     rmdir(p) < 0) {
1406
1407                         if (errno == ENOTDIR) {
1408                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1409                                 return r;
1410                         } else {
1411                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1412                                 return -errno;
1413                         }
1414                 }
1415         } else if (r != -ENOENT) {
1416                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1417                 return r;
1418         }
1419
1420         if (arg_link_journal == LINK_GUEST) {
1421
1422                 if (symlink(q, p) < 0) {
1423                         if (arg_link_journal_try) {
1424                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1425                                 return 0;
1426                         } else {
1427                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1428                                 return -errno;
1429                         }
1430                 }
1431
1432                 r = mkdir_p(q, 0755);
1433                 if (r < 0)
1434                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1435                 return 0;
1436         }
1437
1438         if (arg_link_journal == LINK_HOST) {
1439                 /* don't create parents here -- if the host doesn't have
1440                  * permanent journal set up, don't force it here */
1441                 r = mkdir(p, 0755);
1442                 if (r < 0) {
1443                         if (arg_link_journal_try) {
1444                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1445                                 return 0;
1446                         } else {
1447                                 log_error_errno(errno, "Failed to create %s: %m", p);
1448                                 return r;
1449                         }
1450                 }
1451
1452         } else if (access(p, F_OK) < 0)
1453                 return 0;
1454
1455         if (dir_is_empty(q) == 0)
1456                 log_warning("%s is not empty, proceeding anyway.", q);
1457
1458         r = mkdir_p(q, 0755);
1459         if (r < 0) {
1460                 log_error_errno(errno, "Failed to create %s: %m", q);
1461                 return r;
1462         }
1463
1464         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1465                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1466
1467         return 0;
1468 }
1469
1470 static int drop_capabilities(void) {
1471         return capability_bounding_set_drop(~arg_retain, false);
1472 }
1473
1474 static int register_machine(pid_t pid, int local_ifindex) {
1475         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1476         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1477         int r;
1478
1479         if (!arg_register)
1480                 return 0;
1481
1482         r = sd_bus_default_system(&bus);
1483         if (r < 0)
1484                 return log_error_errno(r, "Failed to open system bus: %m");
1485
1486         if (arg_keep_unit) {
1487                 r = sd_bus_call_method(
1488                                 bus,
1489                                 "org.freedesktop.machine1",
1490                                 "/org/freedesktop/machine1",
1491                                 "org.freedesktop.machine1.Manager",
1492                                 "RegisterMachineWithNetwork",
1493                                 &error,
1494                                 NULL,
1495                                 "sayssusai",
1496                                 arg_machine,
1497                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1498                                 "nspawn",
1499                                 "container",
1500                                 (uint32_t) pid,
1501                                 strempty(arg_directory),
1502                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1503         } else {
1504                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1505
1506                 r = sd_bus_message_new_method_call(
1507                                 bus,
1508                                 &m,
1509                                 "org.freedesktop.machine1",
1510                                 "/org/freedesktop/machine1",
1511                                 "org.freedesktop.machine1.Manager",
1512                                 "CreateMachineWithNetwork");
1513                 if (r < 0)
1514                         return log_error_errno(r, "Failed to create message: %m");
1515
1516                 r = sd_bus_message_append(
1517                                 m,
1518                                 "sayssusai",
1519                                 arg_machine,
1520                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1521                                 "nspawn",
1522                                 "container",
1523                                 (uint32_t) pid,
1524                                 strempty(arg_directory),
1525                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1526                 if (r < 0)
1527                         return log_error_errno(r, "Failed to append message arguments: %m");
1528
1529                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1530                 if (r < 0)
1531                         return log_error_errno(r, "Failed to open container: %m");
1532
1533                 if (!isempty(arg_slice)) {
1534                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1535                         if (r < 0)
1536                                 return log_error_errno(r, "Failed to append slice: %m");
1537                 }
1538
1539                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1540                 if (r < 0)
1541                         return log_error_errno(r, "Failed to add device policy: %m");
1542
1543                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1544                                           /* Allow the container to
1545                                            * access and create the API
1546                                            * device nodes, so that
1547                                            * PrivateDevices= in the
1548                                            * container can work
1549                                            * fine */
1550                                           "/dev/null", "rwm",
1551                                           "/dev/zero", "rwm",
1552                                           "/dev/full", "rwm",
1553                                           "/dev/random", "rwm",
1554                                           "/dev/urandom", "rwm",
1555                                           "/dev/tty", "rwm",
1556                                           "/dev/net/tun", "rwm",
1557                                           /* Allow the container
1558                                            * access to ptys. However,
1559                                            * do not permit the
1560                                            * container to ever create
1561                                            * these device nodes. */
1562                                           "/dev/pts/ptmx", "rw",
1563                                           "char-pts", "rw");
1564                 if (r < 0)
1565                         return log_error_errno(r, "Failed to add device whitelist: %m");
1566
1567                 r = sd_bus_message_close_container(m);
1568                 if (r < 0)
1569                         return log_error_errno(r, "Failed to close container: %m");
1570
1571                 r = sd_bus_call(bus, m, 0, &error, NULL);
1572         }
1573
1574         if (r < 0) {
1575                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1576                 return r;
1577         }
1578
1579         return 0;
1580 }
1581
1582 static int terminate_machine(pid_t pid) {
1583         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1584         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1585         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1586         const char *path;
1587         int r;
1588
1589         if (!arg_register)
1590                 return 0;
1591
1592         r = sd_bus_default_system(&bus);
1593         if (r < 0)
1594                 return log_error_errno(r, "Failed to open system bus: %m");
1595
1596         r = sd_bus_call_method(
1597                         bus,
1598                         "org.freedesktop.machine1",
1599                         "/org/freedesktop/machine1",
1600                         "org.freedesktop.machine1.Manager",
1601                         "GetMachineByPID",
1602                         &error,
1603                         &reply,
1604                         "u",
1605                         (uint32_t) pid);
1606         if (r < 0) {
1607                 /* Note that the machine might already have been
1608                  * cleaned up automatically, hence don't consider it a
1609                  * failure if we cannot get the machine object. */
1610                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1611                 return 0;
1612         }
1613
1614         r = sd_bus_message_read(reply, "o", &path);
1615         if (r < 0)
1616                 return bus_log_parse_error(r);
1617
1618         r = sd_bus_call_method(
1619                         bus,
1620                         "org.freedesktop.machine1",
1621                         path,
1622                         "org.freedesktop.machine1.Machine",
1623                         "Terminate",
1624                         &error,
1625                         NULL,
1626                         NULL);
1627         if (r < 0) {
1628                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1629                 return 0;
1630         }
1631
1632         return 0;
1633 }
1634
1635 static int reset_audit_loginuid(void) {
1636         _cleanup_free_ char *p = NULL;
1637         int r;
1638
1639         if (arg_share_system)
1640                 return 0;
1641
1642         r = read_one_line_file("/proc/self/loginuid", &p);
1643         if (r == -ENOENT)
1644                 return 0;
1645         if (r < 0)
1646                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1647
1648         /* Already reset? */
1649         if (streq(p, "4294967295"))
1650                 return 0;
1651
1652         r = write_string_file("/proc/self/loginuid", "4294967295");
1653         if (r < 0) {
1654                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1655                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1656                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1657                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1658                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1659
1660                 sleep(5);
1661         }
1662
1663         return 0;
1664 }
1665
1666 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1667 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1668 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
1669
1670 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
1671         uint8_t result[8];
1672         size_t l, sz;
1673         uint8_t *v, *i;
1674         int r;
1675
1676         l = strlen(arg_machine);
1677         sz = sizeof(sd_id128_t) + l;
1678         if (idx > 0)
1679                 sz += sizeof(idx);
1680
1681         v = alloca(sz);
1682
1683         /* fetch some persistent data unique to the host */
1684         r = sd_id128_get_machine((sd_id128_t*) v);
1685         if (r < 0)
1686                 return r;
1687
1688         /* combine with some data unique (on this host) to this
1689          * container instance */
1690         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
1691         if (idx > 0) {
1692                 idx = htole64(idx);
1693                 memcpy(i, &idx, sizeof(idx));
1694         }
1695
1696         /* Let's hash the host machine ID plus the container name. We
1697          * use a fixed, but originally randomly created hash key here. */
1698         siphash24(result, v, sz, hash_key.bytes);
1699
1700         assert_cc(ETH_ALEN <= sizeof(result));
1701         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1702
1703         /* see eth_random_addr in the kernel */
1704         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1705         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1706
1707         return 0;
1708 }
1709
1710 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1711         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1712         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1713         struct ether_addr mac_host, mac_container;
1714         int r, i;
1715
1716         if (!arg_private_network)
1717                 return 0;
1718
1719         if (!arg_network_veth)
1720                 return 0;
1721
1722         /* Use two different interface name prefixes depending whether
1723          * we are in bridge mode or not. */
1724         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1725                  arg_network_bridge ? "vb" : "ve", arg_machine);
1726
1727         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1728         if (r < 0)
1729                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
1730
1731         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1732         if (r < 0)
1733                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
1734
1735         r = sd_rtnl_open(&rtnl, 0);
1736         if (r < 0)
1737                 return log_error_errno(r, "Failed to connect to netlink: %m");
1738
1739         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1740         if (r < 0)
1741                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1742
1743         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1744         if (r < 0)
1745                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1746
1747         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1748         if (r < 0)
1749                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1750
1751         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1752         if (r < 0)
1753                 return log_error_errno(r, "Failed to open netlink container: %m");
1754
1755         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1756         if (r < 0)
1757                 return log_error_errno(r, "Failed to open netlink container: %m");
1758
1759         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1760         if (r < 0)
1761                 return log_error_errno(r, "Failed to open netlink container: %m");
1762
1763         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1764         if (r < 0)
1765                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1766
1767         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1768         if (r < 0)
1769                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1770
1771         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1772         if (r < 0)
1773                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1774
1775         r = sd_rtnl_message_close_container(m);
1776         if (r < 0)
1777                 return log_error_errno(r, "Failed to close netlink container: %m");
1778
1779         r = sd_rtnl_message_close_container(m);
1780         if (r < 0)
1781                 return log_error_errno(r, "Failed to close netlink container: %m");
1782
1783         r = sd_rtnl_message_close_container(m);
1784         if (r < 0)
1785                 return log_error_errno(r, "Failed to close netlink container: %m");
1786
1787         r = sd_rtnl_call(rtnl, m, 0, NULL);
1788         if (r < 0)
1789                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
1790
1791         i = (int) if_nametoindex(iface_name);
1792         if (i <= 0)
1793                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
1794
1795         *ifi = i;
1796
1797         return 0;
1798 }
1799
1800 static int setup_bridge(const char veth_name[], int *ifi) {
1801         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1802         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1803         int r, bridge;
1804
1805         if (!arg_private_network)
1806                 return 0;
1807
1808         if (!arg_network_veth)
1809                 return 0;
1810
1811         if (!arg_network_bridge)
1812                 return 0;
1813
1814         bridge = (int) if_nametoindex(arg_network_bridge);
1815         if (bridge <= 0)
1816                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
1817
1818         *ifi = bridge;
1819
1820         r = sd_rtnl_open(&rtnl, 0);
1821         if (r < 0)
1822                 return log_error_errno(r, "Failed to connect to netlink: %m");
1823
1824         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1825         if (r < 0)
1826                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1827
1828         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1829         if (r < 0)
1830                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
1831
1832         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1833         if (r < 0)
1834                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
1835
1836         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1837         if (r < 0)
1838                 return log_error_errno(r, "Failed to add netlink master field: %m");
1839
1840         r = sd_rtnl_call(rtnl, m, 0, NULL);
1841         if (r < 0)
1842                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
1843
1844         return 0;
1845 }
1846
1847 static int parse_interface(struct udev *udev, const char *name) {
1848         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1849         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1850         int ifi;
1851
1852         ifi = (int) if_nametoindex(name);
1853         if (ifi <= 0)
1854                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
1855
1856         sprintf(ifi_str, "n%i", ifi);
1857         d = udev_device_new_from_device_id(udev, ifi_str);
1858         if (!d)
1859                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
1860
1861         if (udev_device_get_is_initialized(d) <= 0) {
1862                 log_error("Network interface %s is not initialized yet.", name);
1863                 return -EBUSY;
1864         }
1865
1866         return ifi;
1867 }
1868
1869 static int move_network_interfaces(pid_t pid) {
1870         _cleanup_udev_unref_ struct udev *udev = NULL;
1871         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1872         char **i;
1873         int r;
1874
1875         if (!arg_private_network)
1876                 return 0;
1877
1878         if (strv_isempty(arg_network_interfaces))
1879                 return 0;
1880
1881         r = sd_rtnl_open(&rtnl, 0);
1882         if (r < 0)
1883                 return log_error_errno(r, "Failed to connect to netlink: %m");
1884
1885         udev = udev_new();
1886         if (!udev) {
1887                 log_error("Failed to connect to udev.");
1888                 return -ENOMEM;
1889         }
1890
1891         STRV_FOREACH(i, arg_network_interfaces) {
1892                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1893                 int ifi;
1894
1895                 ifi = parse_interface(udev, *i);
1896                 if (ifi < 0)
1897                         return ifi;
1898
1899                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1900                 if (r < 0)
1901                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1902
1903                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1904                 if (r < 0)
1905                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
1906
1907                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1908                 if (r < 0)
1909                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
1910         }
1911
1912         return 0;
1913 }
1914
1915 static int setup_macvlan(pid_t pid) {
1916         _cleanup_udev_unref_ struct udev *udev = NULL;
1917         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1918         unsigned idx = 0;
1919         char **i;
1920         int r;
1921
1922         if (!arg_private_network)
1923                 return 0;
1924
1925         if (strv_isempty(arg_network_macvlan))
1926                 return 0;
1927
1928         r = sd_rtnl_open(&rtnl, 0);
1929         if (r < 0)
1930                 return log_error_errno(r, "Failed to connect to netlink: %m");
1931
1932         udev = udev_new();
1933         if (!udev) {
1934                 log_error("Failed to connect to udev.");
1935                 return -ENOMEM;
1936         }
1937
1938         STRV_FOREACH(i, arg_network_macvlan) {
1939                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1940                 _cleanup_free_ char *n = NULL;
1941                 struct ether_addr mac;
1942                 int ifi;
1943
1944                 ifi = parse_interface(udev, *i);
1945                 if (ifi < 0)
1946                         return ifi;
1947
1948                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
1949                 if (r < 0)
1950                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
1951
1952                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1953                 if (r < 0)
1954                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1955
1956                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1957                 if (r < 0)
1958                         return log_error_errno(r, "Failed to add netlink interface index: %m");
1959
1960                 n = strappend("mv-", *i);
1961                 if (!n)
1962                         return log_oom();
1963
1964                 strshorten(n, IFNAMSIZ-1);
1965
1966                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1967                 if (r < 0)
1968                         return log_error_errno(r, "Failed to add netlink interface name: %m");
1969
1970                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1971                 if (r < 0)
1972                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
1973
1974                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1975                 if (r < 0)
1976                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
1977
1978                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1979                 if (r < 0)
1980                         return log_error_errno(r, "Failed to open netlink container: %m");
1981
1982                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1983                 if (r < 0)
1984                         return log_error_errno(r, "Failed to open netlink container: %m");
1985
1986                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1987                 if (r < 0)
1988                         return log_error_errno(r, "Failed to append macvlan mode: %m");
1989
1990                 r = sd_rtnl_message_close_container(m);
1991                 if (r < 0)
1992                         return log_error_errno(r, "Failed to close netlink container: %m");
1993
1994                 r = sd_rtnl_message_close_container(m);
1995                 if (r < 0)
1996                         return log_error_errno(r, "Failed to close netlink container: %m");
1997
1998                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1999                 if (r < 0)
2000                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2001         }
2002
2003         return 0;
2004 }
2005
/* Installs a seccomp filter that allows everything by default, makes
 * the syscalls listed below fail with EPERM, and makes creation of
 * NETLINK_AUDIT sockets fail with EAFNOSUPPORT.
 *
 * Returns 0 on success or a negative error after logging. If built
 * without HAVE_SECCOMP this does nothing and returns 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blocked_syscalls); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[n], 0);

                /* -EFAULT indicates an unknown syscall, which is
                 * simply skipped. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers: much of the userspace audit
         * hookup will fail if running inside a container. We don't
         * care and just turn off creation of audit sockets.
         *
         * This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace uses as indication that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(ctx,
                             SCMP_ACT_ERRNO(EAFNOSUPPORT),
                             SCMP_SYS(socket),
                             2,
                             SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                             SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on success and failure alike */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2085
2086 static int setup_propagate(const char *root) {
2087         const char *p, *q;
2088
2089         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2090         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2091         p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2092         (void) mkdir_p(p, 0600);
2093
2094         q = strappenda(root, "/run/systemd/nspawn/incoming");
2095         mkdir_parents(q, 0755);
2096         mkdir_p(q, 0600);
2097
2098         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2099                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2100
2101         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2102                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2103
2104         return 0;
2105 }
2106
2107 static int setup_image(char **device_path, int *loop_nr) {
2108         struct loop_info64 info = {
2109                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2110         };
2111         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2112         _cleanup_free_ char* loopdev = NULL;
2113         struct stat st;
2114         int r, nr;
2115
2116         assert(device_path);
2117         assert(loop_nr);
2118         assert(arg_image);
2119
2120         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2121         if (fd < 0)
2122                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2123
2124         if (fstat(fd, &st) < 0)
2125                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2126
2127         if (S_ISBLK(st.st_mode)) {
2128                 char *p;
2129
2130                 p = strdup(arg_image);
2131                 if (!p)
2132                         return log_oom();
2133
2134                 *device_path = p;
2135
2136                 *loop_nr = -1;
2137
2138                 r = fd;
2139                 fd = -1;
2140
2141                 return r;
2142         }
2143
2144         if (!S_ISREG(st.st_mode)) {
2145                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2146                 return -EINVAL;
2147         }
2148
2149         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2150         if (control < 0)
2151                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2152
2153         nr = ioctl(control, LOOP_CTL_GET_FREE);
2154         if (nr < 0)
2155                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2156
2157         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2158                 return log_oom();
2159
2160         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2161         if (loop < 0)
2162                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2163
2164         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2165                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2166
2167         if (arg_read_only)
2168                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2169
2170         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2171                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2172
2173         *device_path = loopdev;
2174         loopdev = NULL;
2175
2176         *loop_nr = nr;
2177
2178         r = loop;
2179         loop = -1;
2180
2181         return r;
2182 }
2183
2184 static int dissect_image(
2185                 int fd,
2186                 char **root_device, bool *root_device_rw,
2187                 char **home_device, bool *home_device_rw,
2188                 char **srv_device, bool *srv_device_rw,
2189                 bool *secondary) {
2190
2191 #ifdef HAVE_BLKID
2192         int home_nr = -1, srv_nr = -1;
2193 #ifdef GPT_ROOT_NATIVE
2194         int root_nr = -1;
2195 #endif
2196 #ifdef GPT_ROOT_SECONDARY
2197         int secondary_root_nr = -1;
2198 #endif
2199
2200         _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2201         _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2202         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2203         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2204         _cleanup_udev_unref_ struct udev *udev = NULL;
2205         struct udev_list_entry *first, *item;
2206         bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2207         const char *pttype = NULL;
2208         blkid_partlist pl;
2209         struct stat st;
2210         int r;
2211
2212         assert(fd >= 0);
2213         assert(root_device);
2214         assert(home_device);
2215         assert(srv_device);
2216         assert(secondary);
2217         assert(arg_image);
2218
2219         b = blkid_new_probe();
2220         if (!b)
2221                 return log_oom();
2222
2223         errno = 0;
2224         r = blkid_probe_set_device(b, fd, 0, 0);
2225         if (r != 0) {
2226                 if (errno == 0)
2227                         return log_oom();
2228
2229                 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2230                 return -errno;
2231         }
2232
2233         blkid_probe_enable_partitions(b, 1);
2234         blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2235
2236         errno = 0;
2237         r = blkid_do_safeprobe(b);
2238         if (r == -2 || r == 1) {
2239                 log_error("Failed to identify any partition table on %s.\n"
2240                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2241                 return -EINVAL;
2242         } else if (r != 0) {
2243                 if (errno == 0)
2244                         errno = EIO;
2245                 log_error_errno(errno, "Failed to probe: %m");
2246                 return -errno;
2247         }
2248
2249         blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2250         if (!streq_ptr(pttype, "gpt")) {
2251                 log_error("Image %s does not carry a GUID Partition Table.\n"
2252                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2253                 return -EINVAL;
2254         }
2255
2256         errno = 0;
2257         pl = blkid_probe_get_partitions(b);
2258         if (!pl) {
2259                 if (errno == 0)
2260                         return log_oom();
2261
2262                 log_error("Failed to list partitions of %s", arg_image);
2263                 return -errno;
2264         }
2265
2266         udev = udev_new();
2267         if (!udev)
2268                 return log_oom();
2269
2270         if (fstat(fd, &st) < 0)
2271                 return log_error_errno(errno, "Failed to stat block device: %m");
2272
2273         d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2274         if (!d)
2275                 return log_oom();
2276
2277         e = udev_enumerate_new(udev);
2278         if (!e)
2279                 return log_oom();
2280
2281         r = udev_enumerate_add_match_parent(e, d);
2282         if (r < 0)
2283                 return log_oom();
2284
2285         r = udev_enumerate_scan_devices(e);
2286         if (r < 0)
2287                 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2288
2289         first = udev_enumerate_get_list_entry(e);
2290         udev_list_entry_foreach(item, first) {
2291                 _cleanup_udev_device_unref_ struct udev_device *q;
2292                 const char *stype, *node;
2293                 unsigned long long flags;
2294                 sd_id128_t type_id;
2295                 blkid_partition pp;
2296                 dev_t qn;
2297                 int nr;
2298
2299                 errno = 0;
2300                 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2301                 if (!q) {
2302                         if (!errno)
2303                                 errno = ENOMEM;
2304
2305                         log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2306                         return -errno;
2307                 }
2308
2309                 qn = udev_device_get_devnum(q);
2310                 if (major(qn) == 0)
2311                         continue;
2312
2313                 if (st.st_rdev == qn)
2314                         continue;
2315
2316                 node = udev_device_get_devnode(q);
2317                 if (!node)
2318                         continue;
2319
2320                 pp = blkid_partlist_devno_to_partition(pl, qn);
2321                 if (!pp)
2322                         continue;
2323
2324                 flags = blkid_partition_get_flags(pp);
2325                 if (flags & GPT_FLAG_NO_AUTO)
2326                         continue;
2327
2328                 nr = blkid_partition_get_partno(pp);
2329                 if (nr < 0)
2330                         continue;
2331
2332                 stype = blkid_partition_get_type_string(pp);
2333                 if (!stype)
2334                         continue;
2335
2336                 if (sd_id128_from_string(stype, &type_id) < 0)
2337                         continue;
2338
2339                 if (sd_id128_equal(type_id, GPT_HOME)) {
2340
2341                         if (home && nr >= home_nr)
2342                                 continue;
2343
2344                         home_nr = nr;
2345                         home_rw = !(flags & GPT_FLAG_READ_ONLY);
2346
2347                         free(home);
2348                         home = strdup(node);
2349                         if (!home)
2350                                 return log_oom();
2351                 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2352
2353                         if (srv && nr >= srv_nr)
2354                                 continue;
2355
2356                         srv_nr = nr;
2357                         srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2358
2359                         free(srv);
2360                         srv = strdup(node);
2361                         if (!srv)
2362                                 return log_oom();
2363                 }
2364 #ifdef GPT_ROOT_NATIVE
2365                 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2366
2367                         if (root && nr >= root_nr)
2368                                 continue;
2369
2370                         root_nr = nr;
2371                         root_rw = !(flags & GPT_FLAG_READ_ONLY);
2372
2373                         free(root);
2374                         root = strdup(node);
2375                         if (!root)
2376                                 return log_oom();
2377                 }
2378 #endif
2379 #ifdef GPT_ROOT_SECONDARY
2380                 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2381
2382                         if (secondary_root && nr >= secondary_root_nr)
2383                                 continue;
2384
2385                         secondary_root_nr = nr;
2386                         secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2387
2388
2389                         free(secondary_root);
2390                         secondary_root = strdup(node);
2391                         if (!secondary_root)
2392                                 return log_oom();
2393                 }
2394 #endif
2395         }
2396
2397         if (!root && !secondary_root) {
2398                 log_error("Failed to identify root partition in disk image %s.\n"
2399                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2400                 return -EINVAL;
2401         }
2402
2403         if (root) {
2404                 *root_device = root;
2405                 root = NULL;
2406
2407                 *root_device_rw = root_rw;
2408                 *secondary = false;
2409         } else if (secondary_root) {
2410                 *root_device = secondary_root;
2411                 secondary_root = NULL;
2412
2413                 *root_device_rw = secondary_root_rw;
2414                 *secondary = true;
2415         }
2416
2417         if (home) {
2418                 *home_device = home;
2419                 home = NULL;
2420
2421                 *home_device_rw = home_rw;
2422         }
2423
2424         if (srv) {
2425                 *srv_device = srv;
2426                 srv = NULL;
2427
2428                 *srv_device_rw = srv_rw;
2429         }
2430
2431         return 0;
2432 #else
2433         log_error("--image= is not supported, compiled without blkid support.");
2434         return -ENOTSUP;
2435 #endif
2436 }
2437
/* Probes the block device 'what' with libblkid to learn its file system type
 * and mounts it with MS_NODEV on 'where' (with 'directory' appended to the
 * path when it is non-NULL). The mount is made read-only when 'rw' is false
 * or when arg_read_only is set. LUKS containers are refused.
 *
 * Returns 0 on success and a negative errno-style value on failure. When
 * built without HAVE_BLKID this always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, so clear it first and treat
         * "still zero" as an allocation failure */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe(): 0 = success, 1 = nothing detected,
         * -2 = ambivalent result, -1 = error. Only the first two non-zero
         * cases mean "we cannot tell the file system type"; -1 is a real
         * probing failure and is reported with its errno below. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                return log_error_errno(errno, "Failed to determine file system type of %s", what);
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2501
/* Mounts the partitions discovered in a disk image below 'where': the root
 * partition on 'where' itself, the home partition on <where>/home and the
 * server data partition on <where>/srv. Any device passed as NULL is skipped.
 * The *_rw flags are forwarded to mount_device() for the respective device.
 *
 * Returns 0 on success, or the negative error of the first mount that
 * failed (later devices are not attempted). */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the directory the caller asked for, rather than
         * silently ignoring the parameter in favour of a global */
        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
2531
2532 static void loop_remove(int nr, int *image_fd) {
2533         _cleanup_close_ int control = -1;
2534         int r;
2535
2536         if (nr < 0)
2537                 return;
2538
2539         if (image_fd && *image_fd >= 0) {
2540                 r = ioctl(*image_fd, LOOP_CLR_FD);
2541                 if (r < 0)
2542                         log_warning_errno(errno, "Failed to close loop image: %m");
2543                 *image_fd = safe_close(*image_fd);
2544         }
2545
2546         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2547         if (control < 0) {
2548                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2549                 return;
2550         }
2551
2552         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2553         if (r < 0)
2554                 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
2555 }
2556
2557 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2558         int pipe_fds[2];
2559         pid_t pid;
2560
2561         assert(database);
2562         assert(key);
2563         assert(rpid);
2564
2565         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2566                 return log_error_errno(errno, "Failed to allocate pipe: %m");
2567
2568         pid = fork();
2569         if (pid < 0)
2570                 return log_error_errno(errno, "Failed to fork getent child: %m");
2571         else if (pid == 0) {
2572                 int nullfd;
2573                 char *empty_env = NULL;
2574
2575                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2576                         _exit(EXIT_FAILURE);
2577
2578                 if (pipe_fds[0] > 2)
2579                         safe_close(pipe_fds[0]);
2580                 if (pipe_fds[1] > 2)
2581                         safe_close(pipe_fds[1]);
2582
2583                 nullfd = open("/dev/null", O_RDWR);
2584                 if (nullfd < 0)
2585                         _exit(EXIT_FAILURE);
2586
2587                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2588                         _exit(EXIT_FAILURE);
2589
2590                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2591                         _exit(EXIT_FAILURE);
2592
2593                 if (nullfd > 2)
2594                         safe_close(nullfd);
2595
2596                 reset_all_signal_handlers();
2597                 close_all_fds(NULL, 0);
2598
2599                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2600                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2601                 _exit(EXIT_FAILURE);
2602         }
2603
2604         pipe_fds[1] = safe_close(pipe_fds[1]);
2605
2606         *rpid = pid;
2607
2608         return pipe_fds[0];
2609 }
2610
2611 static int change_uid_gid(char **_home) {
2612         char line[LINE_MAX], *x, *u, *g, *h;
2613         const char *word, *state;
2614         _cleanup_free_ uid_t *uids = NULL;
2615         _cleanup_free_ char *home = NULL;
2616         _cleanup_fclose_ FILE *f = NULL;
2617         _cleanup_close_ int fd = -1;
2618         unsigned n_uids = 0;
2619         size_t sz = 0, l;
2620         uid_t uid;
2621         gid_t gid;
2622         pid_t pid;
2623         int r;
2624
2625         assert(_home);
2626
2627         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2628                 /* Reset everything fully to 0, just in case */
2629
2630                 if (setgroups(0, NULL) < 0)
2631                         return log_error_errno(errno, "setgroups() failed: %m");
2632
2633                 if (setresgid(0, 0, 0) < 0)
2634                         return log_error_errno(errno, "setregid() failed: %m");
2635
2636                 if (setresuid(0, 0, 0) < 0)
2637                         return log_error_errno(errno, "setreuid() failed: %m");
2638
2639                 *_home = NULL;
2640                 return 0;
2641         }
2642
2643         /* First, get user credentials */
2644         fd = spawn_getent("passwd", arg_user, &pid);
2645         if (fd < 0)
2646                 return fd;
2647
2648         f = fdopen(fd, "r");
2649         if (!f)
2650                 return log_oom();
2651         fd = -1;
2652
2653         if (!fgets(line, sizeof(line), f)) {
2654
2655                 if (!ferror(f)) {
2656                         log_error("Failed to resolve user %s.", arg_user);
2657                         return -ESRCH;
2658                 }
2659
2660                 log_error_errno(errno, "Failed to read from getent: %m");
2661                 return -errno;
2662         }
2663
2664         truncate_nl(line);
2665
2666         wait_for_terminate_and_warn("getent passwd", pid, true);
2667
2668         x = strchr(line, ':');
2669         if (!x) {
2670                 log_error("/etc/passwd entry has invalid user field.");
2671                 return -EIO;
2672         }
2673
2674         u = strchr(x+1, ':');
2675         if (!u) {
2676                 log_error("/etc/passwd entry has invalid password field.");
2677                 return -EIO;
2678         }
2679
2680         u++;
2681         g = strchr(u, ':');
2682         if (!g) {
2683                 log_error("/etc/passwd entry has invalid UID field.");
2684                 return -EIO;
2685         }
2686
2687         *g = 0;
2688         g++;
2689         x = strchr(g, ':');
2690         if (!x) {
2691                 log_error("/etc/passwd entry has invalid GID field.");
2692                 return -EIO;
2693         }
2694
2695         *x = 0;
2696         h = strchr(x+1, ':');
2697         if (!h) {
2698                 log_error("/etc/passwd entry has invalid GECOS field.");
2699                 return -EIO;
2700         }
2701
2702         h++;
2703         x = strchr(h, ':');
2704         if (!x) {
2705                 log_error("/etc/passwd entry has invalid home directory field.");
2706                 return -EIO;
2707         }
2708
2709         *x = 0;
2710
2711         r = parse_uid(u, &uid);
2712         if (r < 0) {
2713                 log_error("Failed to parse UID of user.");
2714                 return -EIO;
2715         }
2716
2717         r = parse_gid(g, &gid);
2718         if (r < 0) {
2719                 log_error("Failed to parse GID of user.");
2720                 return -EIO;
2721         }
2722
2723         home = strdup(h);
2724         if (!home)
2725                 return log_oom();
2726
2727         /* Second, get group memberships */
2728         fd = spawn_getent("initgroups", arg_user, &pid);
2729         if (fd < 0)
2730                 return fd;
2731
2732         fclose(f);
2733         f = fdopen(fd, "r");
2734         if (!f)
2735                 return log_oom();
2736         fd = -1;
2737
2738         if (!fgets(line, sizeof(line), f)) {
2739                 if (!ferror(f)) {
2740                         log_error("Failed to resolve user %s.", arg_user);
2741                         return -ESRCH;
2742                 }
2743
2744                 log_error_errno(errno, "Failed to read from getent: %m");
2745                 return -errno;
2746         }
2747
2748         truncate_nl(line);
2749
2750         wait_for_terminate_and_warn("getent initgroups", pid, true);
2751
2752         /* Skip over the username and subsequent separator whitespace */
2753         x = line;
2754         x += strcspn(x, WHITESPACE);
2755         x += strspn(x, WHITESPACE);
2756
2757         FOREACH_WORD(word, l, x, state) {
2758                 char c[l+1];
2759
2760                 memcpy(c, word, l);
2761                 c[l] = 0;
2762
2763                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2764                         return log_oom();
2765
2766                 r = parse_uid(c, &uids[n_uids++]);
2767                 if (r < 0) {
2768                         log_error("Failed to parse group data from getent.");
2769                         return -EIO;
2770                 }
2771         }
2772
2773         r = mkdir_parents(home, 0775);
2774         if (r < 0)
2775                 return log_error_errno(r, "Failed to make home root directory: %m");
2776
2777         r = mkdir_safe(home, 0755, uid, gid);
2778         if (r < 0 && r != -EEXIST)
2779                 return log_error_errno(r, "Failed to make home directory: %m");
2780
2781         fchown(STDIN_FILENO, uid, gid);
2782         fchown(STDOUT_FILENO, uid, gid);
2783         fchown(STDERR_FILENO, uid, gid);
2784
2785         if (setgroups(n_uids, uids) < 0)
2786                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
2787
2788         if (setresgid(gid, gid, gid) < 0)
2789                 return log_error_errno(errno, "setregid() failed: %m");
2790
2791         if (setresuid(uid, uid, uid) < 0)
2792                 return log_error_errno(errno, "setreuid() failed: %m");
2793
2794         if (_home) {
2795                 *_home = home;
2796                 home = NULL;
2797         }
2798
2799         return 0;
2800 }
2801
2802 /*
2803  * Return values:
2804  * < 0 : wait_for_terminate() failed to get the state of the
2805  *       container, the container was terminated by a signal, or
2806  *       failed for an unknown reason.  No change is made to the
2807  *       container argument.
2808  * > 0 : The program executed in the container terminated with an
2809  *       error.  The exit code of the program executed in the
2810  *       container is returned.  The container argument has been set
2811  *       to CONTAINER_TERMINATED.
2812  *   0 : The container is being rebooted, has been shut down or exited
2813  *       successfully.  The container argument has been set to either
2814  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2815  *
2816  * That is, success is indicated by a return value of zero, and an
2817  * error is indicated by a non-zero value.
2818  */
2819 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2820         siginfo_t status;
2821         int r;
2822
2823         r = wait_for_terminate(pid, &status);
2824         if (r < 0)
2825                 return log_warning_errno(r, "Failed to wait for container: %m");
2826
2827         switch (status.si_code) {
2828
2829         case CLD_EXITED:
2830                 if (status.si_status == 0) {
2831                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2832
2833                 } else
2834                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2835
2836                 *container = CONTAINER_TERMINATED;
2837                 return status.si_status;
2838
2839         case CLD_KILLED:
2840                 if (status.si_status == SIGINT) {
2841
2842                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2843                         *container = CONTAINER_TERMINATED;
2844                         return 0;
2845
2846                 } else if (status.si_status == SIGHUP) {
2847
2848                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2849                         *container = CONTAINER_REBOOTED;
2850                         return 0;
2851                 }
2852
2853                 /* CLD_KILLED fallthrough */
2854
2855         case CLD_DUMPED:
2856                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2857                 return -EIO;
2858
2859         default:
2860                 log_error("Container %s failed due to unknown reason.", arg_machine);
2861                 return -EIO;
2862         }
2863
2864         return r;
2865 }
2866
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
}
2868
2869 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2870         pid_t pid;
2871
2872         pid = PTR_TO_UINT32(userdata);
2873         if (pid > 0) {
2874                 if (kill(pid, SIGRTMIN+3) >= 0) {
2875                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2876                         sd_event_source_set_userdata(s, NULL);
2877                         return 0;
2878                 }
2879         }
2880
2881         sd_event_exit(sd_event_source_get_event(s), 0);
2882         return 0;
2883 }
2884
2885 static int determine_names(void) {
2886         int r;
2887
2888         if (!arg_image && !arg_directory) {
2889                 if (arg_machine) {
2890                         _cleanup_(image_unrefp) Image *i = NULL;
2891
2892                         r = image_find(arg_machine, &i);
2893                         if (r < 0)
2894                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
2895                         else if (r == 0) {
2896                                 log_error("No image for machine '%s': %m", arg_machine);
2897                                 return -ENOENT;
2898                         }
2899
2900                         if (i->type == IMAGE_GPT)
2901                                 r = set_sanitized_path(&arg_image, i->path);
2902                         else
2903                                 r = set_sanitized_path(&arg_directory, i->path);
2904                         if (r < 0)
2905                                 return log_error_errno(r, "Invalid image directory: %m");
2906
2907                         arg_read_only = arg_read_only || i->read_only;
2908                 } else
2909                         arg_directory = get_current_dir_name();
2910
2911                 if (!arg_directory && !arg_machine) {
2912                         log_error("Failed to determine path, please use -D or -i.");
2913                         return -EINVAL;
2914                 }
2915         }
2916
2917         if (!arg_machine) {
2918                 if (arg_directory && path_equal(arg_directory, "/"))
2919                         arg_machine = gethostname_malloc();
2920                 else
2921                         arg_machine = strdup(basename(arg_image ?: arg_directory));
2922
2923                 if (!arg_machine)
2924                         return log_oom();
2925
2926                 hostname_cleanup(arg_machine, false);
2927                 if (!machine_name_is_valid(arg_machine)) {
2928                         log_error("Failed to determine machine name automatically, please use -M.");
2929                         return -EINVAL;
2930                 }
2931
2932                 if (arg_ephemeral) {
2933                         char *b;
2934
2935                         /* Add a random suffix when this is an
2936                          * ephemeral machine, so that we can run many
2937                          * instances at once without manually having
2938                          * to specify -M each time. */
2939
2940                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
2941                                 return log_oom();
2942
2943                         free(arg_machine);
2944                         arg_machine = b;
2945                 }
2946         }
2947
2948         return 0;
2949 }
2950
2951 int main(int argc, char *argv[]) {
2952
2953         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
2954         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2955         _cleanup_close_ int master = -1, image_fd = -1;
2956         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2957         _cleanup_fdset_free_ FDSet *fds = NULL;
2958         int r, n_fd_passed, loop_nr = -1;
2959         char veth_name[IFNAMSIZ];
2960         bool secondary = false, remove_subvol = false;
2961         sigset_t mask, mask_chld;
2962         pid_t pid = 0;
2963         int ret = EXIT_SUCCESS;
2964
2965         log_parse_environment();
2966         log_open();
2967
2968         r = parse_argv(argc, argv);
2969         if (r <= 0)
2970                 goto finish;
2971
2972         r = determine_names();
2973         if (r < 0)
2974                 goto finish;
2975
2976         if (geteuid() != 0) {
2977                 log_error("Need to be root.");
2978                 r = -EPERM;
2979                 goto finish;
2980         }
2981
2982         if (sd_booted() <= 0) {
2983                 log_error("Not running on a systemd system.");
2984                 r = -EINVAL;
2985                 goto finish;
2986         }
2987
2988         log_close();
2989         n_fd_passed = sd_listen_fds(false);
2990         if (n_fd_passed > 0) {
2991                 r = fdset_new_listen_fds(&fds, false);
2992                 if (r < 0) {
2993                         log_error_errno(r, "Failed to collect file descriptors: %m");
2994                         goto finish;
2995                 }
2996         }
2997         fdset_close_others(fds);
2998         log_open();
2999
3000         if (arg_directory) {
3001                 assert(!arg_image);
3002
3003                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3004                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3005                         r = -EINVAL;
3006                         goto finish;
3007                 }
3008
3009                 if (arg_template) {
3010                         r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3011                         if (r == -EEXIST) {
3012                                 if (!arg_quiet)
3013                                         log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3014                         } else if (r < 0) {
3015                                 log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
3016                                 goto finish;
3017                         } else {
3018                                 if (!arg_quiet)
3019                                         log_info("Populated %s from template %s.", arg_directory, arg_template);
3020                         }
3021
3022                 } else if (arg_ephemeral) {
3023                         char *np;
3024
3025                         /* If the specified path is a mount point we
3026                          * generate the new snapshot immediately
3027                          * inside it under a random name. However if
3028                          * the specified is not a mount point we
3029                          * create the new snapshot in the parent
3030                          * directory, just next to it. */
3031                         r = path_is_mount_point(arg_directory, false);
3032                         if (r < 0) {
3033                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3034                                 goto finish;
3035                         }
3036                         if (r > 0)
3037                                 r = tempfn_random_child(arg_directory, &np);
3038                         else
3039                                 r = tempfn_random(arg_directory, &np);
3040                         if (r < 0) {
3041                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3042                                 goto finish;
3043                         }
3044
3045                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3046                         if (r < 0) {
3047                                 free(np);
3048                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3049                                 goto finish;
3050                         }
3051
3052                         free(arg_directory);
3053                         arg_directory = np;
3054
3055                         remove_subvol = true;
3056                 }
3057
3058                 if (arg_boot) {
3059                         if (path_is_os_tree(arg_directory) <= 0) {
3060                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3061                                 r = -EINVAL;
3062                                 goto finish;
3063                         }
3064                 } else {
3065                         const char *p;
3066
3067                         p = strappenda(arg_directory,
3068                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3069                         if (access(p, F_OK) < 0) {
3070                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3071                                 r = -EINVAL;
3072                                 goto finish;
3073                         }
3074                 }
3075
3076         } else {
3077                 char template[] = "/tmp/nspawn-root-XXXXXX";
3078
3079                 assert(arg_image);
3080                 assert(!arg_template);
3081
3082                 if (!mkdtemp(template)) {
3083                         log_error_errno(errno, "Failed to create temporary directory: %m");
3084                         r = -errno;
3085                         goto finish;
3086                 }
3087
3088                 arg_directory = strdup(template);
3089                 if (!arg_directory) {
3090                         r = log_oom();
3091                         goto finish;
3092                 }
3093
3094                 image_fd = setup_image(&device_path, &loop_nr);
3095                 if (image_fd < 0) {
3096                         r = image_fd;
3097                         goto finish;
3098                 }
3099
3100                 r = dissect_image(image_fd,
3101                                   &root_device, &root_device_rw,
3102                                   &home_device, &home_device_rw,
3103                                   &srv_device, &srv_device_rw,
3104                                   &secondary);
3105                 if (r < 0)
3106                         goto finish;
3107         }
3108
3109         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3110         if (master < 0) {
3111                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3112                 goto finish;
3113         }
3114
3115         r = ptsname_malloc(master, &console);
3116         if (r < 0) {
3117                 r = log_error_errno(r, "Failed to determine tty name: %m");
3118                 goto finish;
3119         }
3120
3121         if (!arg_quiet)
3122                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3123                          arg_machine, arg_image ?: arg_directory);
3124
3125         if (unlockpt(master) < 0) {
3126                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3127                 goto finish;
3128         }
3129
3130         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3131                 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3132                 goto finish;
3133         }
3134
3135         sd_notify(false,
3136                   "READY=1\n"
3137                   "STATUS=Container running.");
3138
3139         assert_se(sigemptyset(&mask) == 0);
3140         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3141         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3142
3143         assert_se(sigemptyset(&mask_chld) == 0);
3144         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3145
3146         for (;;) {
3147                 ContainerStatus container_status;
3148                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3149                 struct sigaction sa = {
3150                         .sa_handler = nop_handler,
3151                         .sa_flags = SA_NOCLDSTOP,
3152                 };
3153
3154                 r = barrier_create(&barrier);
3155                 if (r < 0) {
3156                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3157                         goto finish;
3158                 }
3159
3160                 /* Child can be killed before execv(), so handle SIGCHLD
3161                  * in order to interrupt parent's blocking calls and
3162                  * give it a chance to call wait() and terminate. */
3163                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3164                 if (r < 0) {
3165                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3166                         goto finish;
3167                 }
3168
3169                 r = sigaction(SIGCHLD, &sa, NULL);
3170                 if (r < 0) {
3171                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3172                         goto finish;
3173                 }
3174
3175                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3176                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3177                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3178                 if (pid < 0) {
3179                         if (errno == EINVAL)
3180                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3181                         else
3182                                 r = log_error_errno(errno, "clone() failed: %m");
3183
3184                         goto finish;
3185                 }
3186
3187                 if (pid == 0) {
3188                         /* child */
3189                         _cleanup_free_ char *home = NULL;
3190                         unsigned n_env = 2;
3191                         const char *envp[] = {
3192                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3193                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3194                                 NULL, /* TERM */
3195                                 NULL, /* HOME */
3196                                 NULL, /* USER */
3197                                 NULL, /* LOGNAME */
3198                                 NULL, /* container_uuid */
3199                                 NULL, /* LISTEN_FDS */
3200                                 NULL, /* LISTEN_PID */
3201                                 NULL
3202                         };
3203                         char **env_use;
3204
3205                         barrier_set_role(&barrier, BARRIER_CHILD);
3206
3207                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3208                         if (envp[n_env])
3209                                 n_env ++;
3210
3211                         master = safe_close(master);
3212
3213                         close_nointr(STDIN_FILENO);
3214                         close_nointr(STDOUT_FILENO);
3215                         close_nointr(STDERR_FILENO);
3216
3217                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3218
3219                         reset_all_signal_handlers();
3220                         reset_signal_mask();
3221
3222                         r = open_terminal(console, O_RDWR);
3223                         if (r != STDIN_FILENO) {
3224                                 if (r >= 0) {
3225                                         safe_close(r);
3226                                         r = -EINVAL;
3227                                 }
3228
3229                                 log_error_errno(r, "Failed to open console: %m");
3230                                 _exit(EXIT_FAILURE);
3231                         }
3232
3233                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3234                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3235                                 log_error_errno(errno, "Failed to duplicate console: %m");
3236                                 _exit(EXIT_FAILURE);
3237                         }
3238
3239                         if (setsid() < 0) {
3240                                 log_error_errno(errno, "setsid() failed: %m");
3241                                 _exit(EXIT_FAILURE);
3242                         }
3243
3244                         if (reset_audit_loginuid() < 0)
3245                                 _exit(EXIT_FAILURE);
3246
3247                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3248                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3249                                 _exit(EXIT_FAILURE);
3250                         }
3251
3252                         /* Mark everything as slave, so that we still
3253                          * receive mounts from the real root, but don't
3254                          * propagate mounts to the real root. */
3255                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3256                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3257                                 _exit(EXIT_FAILURE);
3258                         }
3259
3260                         if (mount_devices(arg_directory,
3261                                           root_device, root_device_rw,
3262                                           home_device, home_device_rw,
3263                                           srv_device, srv_device_rw) < 0)
3264                                 _exit(EXIT_FAILURE);
3265
3266                         /* Turn directory into bind mount */
3267                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3268                                 log_error_errno(errno, "Failed to make bind mount: %m");
3269                                 _exit(EXIT_FAILURE);
3270                         }
3271
3272                         r = setup_volatile(arg_directory);
3273                         if (r < 0)
3274                                 _exit(EXIT_FAILURE);
3275
3276                         if (setup_volatile_state(arg_directory) < 0)
3277                                 _exit(EXIT_FAILURE);
3278
3279                         r = base_filesystem_create(arg_directory);
3280                         if (r < 0)
3281                                 _exit(EXIT_FAILURE);
3282
3283                         if (arg_read_only) {
3284                                 r = bind_remount_recursive(arg_directory, true);
3285                                 if (r < 0) {
3286                                         log_error_errno(r, "Failed to make tree read-only: %m");
3287                                         _exit(EXIT_FAILURE);
3288                                 }
3289                         }
3290
3291                         if (mount_all(arg_directory) < 0)
3292                                 _exit(EXIT_FAILURE);
3293
3294                         if (copy_devnodes(arg_directory) < 0)
3295                                 _exit(EXIT_FAILURE);
3296
3297                         if (setup_ptmx(arg_directory) < 0)
3298                                 _exit(EXIT_FAILURE);
3299
3300                         dev_setup(arg_directory);
3301
3302                         if (setup_propagate(arg_directory) < 0)
3303                                 _exit(EXIT_FAILURE);
3304
3305                         if (setup_seccomp() < 0)
3306                                 _exit(EXIT_FAILURE);
3307
3308                         if (setup_dev_console(arg_directory, console) < 0)
3309                                 _exit(EXIT_FAILURE);
3310
3311                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3312                                 _exit(EXIT_FAILURE);
3313
3314                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3315
3316                         if (setup_boot_id(arg_directory) < 0)
3317                                 _exit(EXIT_FAILURE);
3318
3319                         if (setup_timezone(arg_directory) < 0)
3320                                 _exit(EXIT_FAILURE);
3321
3322                         if (setup_resolv_conf(arg_directory) < 0)
3323                                 _exit(EXIT_FAILURE);
3324
3325                         if (setup_journal(arg_directory) < 0)
3326                                 _exit(EXIT_FAILURE);
3327
3328                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3329                                 _exit(EXIT_FAILURE);
3330
3331                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3332                                 _exit(EXIT_FAILURE);
3333
3334                         if (mount_tmpfs(arg_directory) < 0)
3335                                 _exit(EXIT_FAILURE);
3336
3337                         /* Tell the parent that we are ready, and that
3338                          * it can cgroupify us so that we lack access
3339                          * to certain devices and resources. */
3340                         (void)barrier_place(&barrier);
3341
3342                         if (chdir(arg_directory) < 0) {
3343                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3344                                 _exit(EXIT_FAILURE);
3345                         }
3346
3347                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3348                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3349                                 _exit(EXIT_FAILURE);
3350                         }
3351
3352                         if (chroot(".") < 0) {
3353                                 log_error_errno(errno, "chroot() failed: %m");
3354                                 _exit(EXIT_FAILURE);
3355                         }
3356
3357                         if (chdir("/") < 0) {
3358                                 log_error_errno(errno, "chdir() failed: %m");
3359                                 _exit(EXIT_FAILURE);
3360                         }
3361
3362                         umask(0022);
3363
3364                         if (arg_private_network)
3365                                 loopback_setup();
3366
3367                         if (drop_capabilities() < 0) {
3368                                 log_error_errno(errno, "drop_capabilities() failed: %m");
3369                                 _exit(EXIT_FAILURE);
3370                         }
3371
3372                         r = change_uid_gid(&home);
3373                         if (r < 0)
3374                                 _exit(EXIT_FAILURE);
3375
3376                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3377                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3378                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3379                                 log_oom();
3380                                 _exit(EXIT_FAILURE);
3381                         }
3382
3383                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3384                                 char as_uuid[37];
3385
3386                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3387                                         log_oom();
3388                                         _exit(EXIT_FAILURE);
3389                                 }
3390                         }
3391
3392                         if (fdset_size(fds) > 0) {
3393                                 r = fdset_cloexec(fds, false);
3394                                 if (r < 0) {
3395                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3396                                         _exit(EXIT_FAILURE);
3397                                 }
3398
3399                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3400                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3401                                         log_oom();
3402                                         _exit(EXIT_FAILURE);
3403                                 }
3404                         }
3405
3406                         setup_hostname();
3407
3408                         if (arg_personality != 0xffffffffLU) {
3409                                 if (personality(arg_personality) < 0) {
3410                                         log_error_errno(errno, "personality() failed: %m");
3411                                         _exit(EXIT_FAILURE);
3412                                 }
3413                         } else if (secondary) {
3414                                 if (personality(PER_LINUX32) < 0) {
3415                                         log_error_errno(errno, "personality() failed: %m");
3416                                         _exit(EXIT_FAILURE);
3417                                 }
3418                         }
3419
3420 #ifdef HAVE_SELINUX
3421                         if (arg_selinux_context)
3422                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3423                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3424                                         _exit(EXIT_FAILURE);
3425                                 }
3426 #endif
3427
3428                         if (!strv_isempty(arg_setenv)) {
3429                                 char **n;
3430
3431                                 n = strv_env_merge(2, envp, arg_setenv);
3432                                 if (!n) {
3433                                         log_oom();
3434                                         _exit(EXIT_FAILURE);
3435                                 }
3436
3437                                 env_use = n;
3438                         } else
3439                                 env_use = (char**) envp;
3440
3441                         /* Wait until the parent is ready with the setup, too... */
3442                         if (!barrier_place_and_sync(&barrier))
3443                                 _exit(EXIT_FAILURE);
3444
3445                         if (arg_boot) {
3446                                 char **a;
3447                                 size_t l;
3448
3449                                 /* Automatically search for the init system */
3450
3451                                 l = 1 + argc - optind;
3452                                 a = newa(char*, l + 1);
3453                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3454
3455                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3456                                 execve(a[0], a, env_use);
3457
3458                                 a[0] = (char*) "/lib/systemd/systemd";
3459                                 execve(a[0], a, env_use);
3460
3461                                 a[0] = (char*) "/sbin/init";
3462                                 execve(a[0], a, env_use);
3463                         } else if (argc > optind)
3464                                 execvpe(argv[optind], argv + optind, env_use);
3465                         else {
3466                                 chdir(home ? home : "/root");
3467                                 execle("/bin/bash", "-bash", NULL, env_use);
3468                                 execle("/bin/sh", "-sh", NULL, env_use);
3469                         }
3470
3471                         log_error_errno(errno, "execv() failed: %m");
3472                         _exit(EXIT_FAILURE);
3473                 }
3474
3475                 barrier_set_role(&barrier, BARRIER_PARENT);
3476                 fdset_free(fds);
3477                 fds = NULL;
3478
3479                 /* wait for child-setup to be done */
3480                 if (barrier_place_and_sync(&barrier)) {
3481                         _cleanup_event_unref_ sd_event *event = NULL;
3482                         _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3483                         char last_char = 0;
3484                         int ifi = 0;
3485
3486                         r = move_network_interfaces(pid);
3487                         if (r < 0)
3488                                 goto finish;
3489
3490                         r = setup_veth(pid, veth_name, &ifi);
3491                         if (r < 0)
3492                                 goto finish;
3493
3494                         r = setup_bridge(veth_name, &ifi);
3495                         if (r < 0)
3496                                 goto finish;
3497
3498                         r = setup_macvlan(pid);
3499                         if (r < 0)
3500                                 goto finish;
3501
3502                         r = register_machine(pid, ifi);
3503                         if (r < 0)
3504                                 goto finish;
3505
3506                         /* Block SIGCHLD here, before notifying child.
3507                          * process_pty() will handle it with the other signals. */
3508                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3509                         if (r < 0)
3510                                 goto finish;
3511
3512                         /* Reset signal to default */
3513                         r = default_signals(SIGCHLD, -1);
3514                         if (r < 0)
3515                                 goto finish;
3516
3517                         /* Notify the child that the parent is ready with all
3518                          * its setup, and that the child can now hand over
3519                          * control to the code to run inside the container. */
3520                         (void)barrier_place(&barrier);
3521
3522                         r = sd_event_new(&event);
3523                         if (r < 0) {
3524                                 log_error_errno(r, "Failed to get default event source: %m");
3525                                 goto finish;
3526                         }
3527
3528                         if (arg_boot) {
3529                                 /* Try to kill the init system on SIGINT or SIGTERM */
3530                                 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3531                                 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3532                         } else {
3533                                 /* Immediately exit */
3534                                 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3535                                 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3536                         }
3537
3538                         /* simply exit on sigchld */
3539                         sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3540
3541                         r = pty_forward_new(event, master, true, &forward);
3542                         if (r < 0) {
3543                                 log_error_errno(r, "Failed to create PTY forwarder: %m");
3544                                 goto finish;
3545                         }
3546
3547                         r = sd_event_loop(event);
3548                         if (r < 0) {
3549                                 log_error_errno(r, "Failed to run event loop: %m");
3550                                 goto finish;
3551                         }
3552
3553                         pty_forward_last_char(forward, &last_char);
3554
3555                         forward = pty_forward_free(forward);
3556
3557                         if (!arg_quiet && last_char != '\n')
3558                                 putc('\n', stdout);
3559
3560                         /* Kill if it is not dead yet anyway */
3561                         terminate_machine(pid);
3562                 }
3563
3564                 /* Normally redundant, but better safe than sorry */
3565                 kill(pid, SIGKILL);
3566
3567                 r = wait_for_container(pid, &container_status);
3568                 pid = 0;
3569
3570                 if (r < 0)
3571                         /* We failed to wait for the container, or the
3572                          * container exited abnormally */
3573                         goto finish;
3574                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3575                         /* The container exited with a non-zero
3576                          * status, or with zero status and no reboot
3577                          * was requested. */
3578                         ret = r;
3579                         break;
3580                 }
3581
3582                 /* CONTAINER_REBOOTED, loop again */
3583
3584                 if (arg_keep_unit) {
3585                         /* Special handling if we are running as a
3586                          * service: instead of simply restarting the
3587                          * machine we want to restart the entire
3588                          * service, so let's inform systemd about this
3589                          * with the special exit code 133. The service
3590                          * file uses RestartForceExitStatus=133 so
3591                          * that this results in a full nspawn
3592                          * restart. This is necessary since we might
3593                          * have cgroup parameters set we want to have
3594                          * flushed out. */
3595                         ret = 133;
3596                         r = 0;
3597                         break;
3598                 }
3599         }
3600
3601 finish:
3602         sd_notify(false,
3603                   "STOPPING=1\n"
3604                   "STATUS=Terminating...");
3605
3606         loop_remove(loop_nr, &image_fd);
3607
3608         if (pid > 0)
3609                 kill(pid, SIGKILL);
3610
3611         if (remove_subvol && arg_directory) {
3612                 int k;
3613
3614                 k = btrfs_subvol_remove(arg_directory);
3615                 if (k < 0)
3616                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3617         }
3618
3619         if (arg_machine) {
3620                 const char *p;
3621
3622                 p = strappenda("/run/systemd/nspawn/propagate", arg_machine);
3623                 (void) rm_rf(p, false, true, false);
3624         }
3625
3626         free(arg_directory);
3627         free(arg_template);
3628         free(arg_image);
3629         free(arg_machine);
3630         free(arg_user);
3631         strv_free(arg_setenv);
3632         strv_free(arg_network_interfaces);
3633         strv_free(arg_network_macvlan);
3634         strv_free(arg_bind);
3635         strv_free(arg_bind_ro);
3636         strv_free(arg_tmpfs);
3637
3638         return r < 0 ? EXIT_FAILURE : ret;
3639 }