chiark / gitweb /
nspawn: allow spawning ephemeral nspawn containers based on the root file system...
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92 #include "event-util.h"
93 #include "cap-list.h"
94 #include "btrfs-util.h"
95
96 #ifdef HAVE_SECCOMP
97 #include "seccomp-util.h"
98 #endif
99
/* Outcome of a container run. The code consuming these values lies outside
 * this excerpt; presumably it decides between exiting and restarting the
 * container -- TODO confirm against the caller. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
104
/* Journal linking mode, selected with --link-journal= (or -j). The "try-"
 * variants of the option map to LINK_GUEST/LINK_HOST plus a separate
 * "try" flag rather than to distinct enum values. */
typedef enum LinkJournal {
        LINK_NO    = 0,         /* --link-journal=no */
        LINK_AUTO  = 1,         /* --link-journal=auto */
        LINK_HOST  = 2,         /* --link-journal=host / try-host */
        LINK_GUEST = 3          /* --link-journal=guest / try-guest, and -j */
} LinkJournal;
111
/* Volatile mode, selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,     /* option absent, or a boolean "false" argument */
        VOLATILE_YES   = 1,     /* bare --volatile, or a boolean "true" argument */
        VOLATILE_STATE = 2,     /* --volatile=state */
} Volatile;
117
118 static char *arg_directory = NULL;
119 static char *arg_template = NULL;
120 static char *arg_user = NULL;
121 static sd_id128_t arg_uuid = {};
122 static char *arg_machine = NULL;
123 static const char *arg_selinux_context = NULL;
124 static const char *arg_selinux_apifs_context = NULL;
125 static const char *arg_slice = NULL;
126 static bool arg_private_network = false;
127 static bool arg_read_only = false;
128 static bool arg_boot = false;
129 static bool arg_ephemeral = false;
130 static LinkJournal arg_link_journal = LINK_AUTO;
131 static bool arg_link_journal_try = false;
132 static uint64_t arg_retain =
133         (1ULL << CAP_CHOWN) |
134         (1ULL << CAP_DAC_OVERRIDE) |
135         (1ULL << CAP_DAC_READ_SEARCH) |
136         (1ULL << CAP_FOWNER) |
137         (1ULL << CAP_FSETID) |
138         (1ULL << CAP_IPC_OWNER) |
139         (1ULL << CAP_KILL) |
140         (1ULL << CAP_LEASE) |
141         (1ULL << CAP_LINUX_IMMUTABLE) |
142         (1ULL << CAP_NET_BIND_SERVICE) |
143         (1ULL << CAP_NET_BROADCAST) |
144         (1ULL << CAP_NET_RAW) |
145         (1ULL << CAP_SETGID) |
146         (1ULL << CAP_SETFCAP) |
147         (1ULL << CAP_SETPCAP) |
148         (1ULL << CAP_SETUID) |
149         (1ULL << CAP_SYS_ADMIN) |
150         (1ULL << CAP_SYS_CHROOT) |
151         (1ULL << CAP_SYS_NICE) |
152         (1ULL << CAP_SYS_PTRACE) |
153         (1ULL << CAP_SYS_TTY_CONFIG) |
154         (1ULL << CAP_SYS_RESOURCE) |
155         (1ULL << CAP_SYS_BOOT) |
156         (1ULL << CAP_AUDIT_WRITE) |
157         (1ULL << CAP_AUDIT_CONTROL) |
158         (1ULL << CAP_MKNOD);
159 static char **arg_bind = NULL;
160 static char **arg_bind_ro = NULL;
161 static char **arg_tmpfs = NULL;
162 static char **arg_setenv = NULL;
163 static bool arg_quiet = false;
164 static bool arg_share_system = false;
165 static bool arg_register = true;
166 static bool arg_keep_unit = false;
167 static char **arg_network_interfaces = NULL;
168 static char **arg_network_macvlan = NULL;
169 static bool arg_network_veth = false;
170 static const char *arg_network_bridge = NULL;
171 static unsigned long arg_personality = 0xffffffffLU;
172 static char *arg_image = NULL;
173 static Volatile arg_volatile = VOLATILE_NO;
174
175 static void help(void) {
176         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
177                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
178                "  -h --help                 Show this help\n"
179                "     --version              Print version string\n"
180                "  -q --quiet                Do not show status information\n"
181                "  -D --directory=PATH       Root directory for the container\n"
182                "     --template=PATH        Initialize root directory from template directory,\n"
183                "                            if missing\n"
184                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
185                "                            remove it after exit\n"
186                "  -i --image=PATH           File system device or disk image for the container\n"
187                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
188                "  -u --user=USER            Run the command under specified user or uid\n"
189                "  -M --machine=NAME         Set the machine name for the container\n"
190                "     --uuid=UUID            Set a specific machine UUID for the container\n"
191                "  -S --slice=SLICE          Place the container in the specified slice\n"
192                "     --private-network      Disable network in container\n"
193                "     --network-interface=INTERFACE\n"
194                "                            Assign an existing network interface to the\n"
195                "                            container\n"
196                "     --network-macvlan=INTERFACE\n"
197                "                            Create a macvlan network interface based on an\n"
198                "                            existing network interface to the container\n"
199                "     --network-veth         Add a virtual ethernet connection between host\n"
200                "                            and container\n"
201                "     --network-bridge=INTERFACE\n"
202                "                            Add a virtual ethernet connection between host\n"
203                "                            and container and add it to an existing bridge on\n"
204                "                            the host\n"
205                "  -Z --selinux-context=SECLABEL\n"
206                "                            Set the SELinux security context to be used by\n"
207                "                            processes in the container\n"
208                "  -L --selinux-apifs-context=SECLABEL\n"
209                "                            Set the SELinux security context to be used by\n"
210                "                            API/tmpfs file systems in the container\n"
211                "     --capability=CAP       In addition to the default, retain specified\n"
212                "                            capability\n"
213                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
214                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
215                "                            try-guest, try-host\n"
216                "  -j                        Equivalent to --link-journal=try-guest\n"
217                "     --read-only            Mount the root directory read-only\n"
218                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
219                "                            the container\n"
220                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
221                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
222                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
223                "     --share-system         Share system namespaces with host\n"
224                "     --register=BOOLEAN     Register container as machine\n"
225                "     --keep-unit            Do not register a scope for the machine, reuse\n"
226                "                            the service unit nspawn is running in\n"
227                "     --volatile[=MODE]      Run the system in volatile mode\n",
228                program_invocation_short_name);
229 }
230
/* Store a cleaned-up copy of "path" in *b, freeing whatever *b held before.
 *
 * The path is resolved with canonicalize_file_name(). If that fails with
 * ENOENT, the path is instead made absolute relative to the current working
 * directory via path_make_absolute_cwd(). The result is handed through
 * path_kill_slashes() before being stored (presumably this collapses
 * redundant slashes in place and returns its argument -- helper not visible
 * here).
 *
 * Returns 0 on success, -errno if resolution fails for any reason other
 * than ENOENT, and -ENOMEM if the fallback fails. On failure *b is left
 * untouched. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than "does not exist" is reported as is. */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);
        return 0;
}
251
252 static int parse_argv(int argc, char *argv[]) {
253
254         enum {
255                 ARG_VERSION = 0x100,
256                 ARG_PRIVATE_NETWORK,
257                 ARG_UUID,
258                 ARG_READ_ONLY,
259                 ARG_CAPABILITY,
260                 ARG_DROP_CAPABILITY,
261                 ARG_LINK_JOURNAL,
262                 ARG_BIND,
263                 ARG_BIND_RO,
264                 ARG_TMPFS,
265                 ARG_SETENV,
266                 ARG_SHARE_SYSTEM,
267                 ARG_REGISTER,
268                 ARG_KEEP_UNIT,
269                 ARG_NETWORK_INTERFACE,
270                 ARG_NETWORK_MACVLAN,
271                 ARG_NETWORK_VETH,
272                 ARG_NETWORK_BRIDGE,
273                 ARG_PERSONALITY,
274                 ARG_VOLATILE,
275                 ARG_TEMPLATE,
276         };
277
278         static const struct option options[] = {
279                 { "help",                  no_argument,       NULL, 'h'                   },
280                 { "version",               no_argument,       NULL, ARG_VERSION           },
281                 { "directory",             required_argument, NULL, 'D'                   },
282                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
283                 { "ephemeral",             no_argument,       NULL, 'x'                   },
284                 { "user",                  required_argument, NULL, 'u'                   },
285                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
286                 { "boot",                  no_argument,       NULL, 'b'                   },
287                 { "uuid",                  required_argument, NULL, ARG_UUID              },
288                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
289                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
290                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
291                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
292                 { "bind",                  required_argument, NULL, ARG_BIND              },
293                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
294                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
295                 { "machine",               required_argument, NULL, 'M'                   },
296                 { "slice",                 required_argument, NULL, 'S'                   },
297                 { "setenv",                required_argument, NULL, ARG_SETENV            },
298                 { "selinux-context",       required_argument, NULL, 'Z'                   },
299                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
300                 { "quiet",                 no_argument,       NULL, 'q'                   },
301                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
302                 { "register",              required_argument, NULL, ARG_REGISTER          },
303                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
304                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
305                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
306                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
307                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
308                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
309                 { "image",                 required_argument, NULL, 'i'                   },
310                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
311                 {}
312         };
313
314         int c, r;
315         uint64_t plus = 0, minus = 0;
316
317         assert(argc >= 0);
318         assert(argv);
319
320         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:x", options, NULL)) >= 0)
321
322                 switch (c) {
323
324                 case 'h':
325                         help();
326                         return 0;
327
328                 case ARG_VERSION:
329                         puts(PACKAGE_STRING);
330                         puts(SYSTEMD_FEATURES);
331                         return 0;
332
333                 case 'D':
334                         r = set_sanitized_path(&arg_directory, optarg);
335                         if (r < 0)
336                                 return log_error_errno(r, "Invalid root directory: %m");
337
338                         break;
339
340                 case ARG_TEMPLATE:
341                         r = set_sanitized_path(&arg_template, optarg);
342                         if (r < 0)
343                                 return log_error_errno(r, "Invalid template directory: %m");
344
345                         break;
346
347                 case 'i':
348                         r = set_sanitized_path(&arg_image, optarg);
349                         if (r < 0)
350                                 return log_error_errno(r, "Invalid image path: %m");
351
352                         break;
353
354                 case 'x':
355                         arg_ephemeral = true;
356                         break;
357
358                 case 'u':
359                         free(arg_user);
360                         arg_user = strdup(optarg);
361                         if (!arg_user)
362                                 return log_oom();
363
364                         break;
365
366                 case ARG_NETWORK_BRIDGE:
367                         arg_network_bridge = optarg;
368
369                         /* fall through */
370
371                 case ARG_NETWORK_VETH:
372                         arg_network_veth = true;
373                         arg_private_network = true;
374                         break;
375
376                 case ARG_NETWORK_INTERFACE:
377                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
378                                 return log_oom();
379
380                         arg_private_network = true;
381                         break;
382
383                 case ARG_NETWORK_MACVLAN:
384                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
385                                 return log_oom();
386
387                         /* fall through */
388
389                 case ARG_PRIVATE_NETWORK:
390                         arg_private_network = true;
391                         break;
392
393                 case 'b':
394                         arg_boot = true;
395                         break;
396
397                 case ARG_UUID:
398                         r = sd_id128_from_string(optarg, &arg_uuid);
399                         if (r < 0) {
400                                 log_error("Invalid UUID: %s", optarg);
401                                 return r;
402                         }
403                         break;
404
405                 case 'S':
406                         arg_slice = optarg;
407                         break;
408
409                 case 'M':
410                         if (isempty(optarg)) {
411                                 free(arg_machine);
412                                 arg_machine = NULL;
413                         } else {
414                                 if (!machine_name_is_valid(optarg)) {
415                                         log_error("Invalid machine name: %s", optarg);
416                                         return -EINVAL;
417                                 }
418
419                                 r = free_and_strdup(&arg_machine, optarg);
420                                 if (r < 0)
421                                         return log_oom();
422
423                                 break;
424                         }
425
426                 case 'Z':
427                         arg_selinux_context = optarg;
428                         break;
429
430                 case 'L':
431                         arg_selinux_apifs_context = optarg;
432                         break;
433
434                 case ARG_READ_ONLY:
435                         arg_read_only = true;
436                         break;
437
438                 case ARG_CAPABILITY:
439                 case ARG_DROP_CAPABILITY: {
440                         const char *state, *word;
441                         size_t length;
442
443                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
444                                 _cleanup_free_ char *t;
445
446                                 t = strndup(word, length);
447                                 if (!t)
448                                         return log_oom();
449
450                                 if (streq(t, "all")) {
451                                         if (c == ARG_CAPABILITY)
452                                                 plus = (uint64_t) -1;
453                                         else
454                                                 minus = (uint64_t) -1;
455                                 } else {
456                                         int cap;
457
458                                         cap = capability_from_name(t);
459                                         if (cap < 0) {
460                                                 log_error("Failed to parse capability %s.", t);
461                                                 return -EINVAL;
462                                         }
463
464                                         if (c == ARG_CAPABILITY)
465                                                 plus |= 1ULL << (uint64_t) cap;
466                                         else
467                                                 minus |= 1ULL << (uint64_t) cap;
468                                 }
469                         }
470
471                         break;
472                 }
473
474                 case 'j':
475                         arg_link_journal = LINK_GUEST;
476                         arg_link_journal_try = true;
477                         break;
478
479                 case ARG_LINK_JOURNAL:
480                         if (streq(optarg, "auto")) {
481                                 arg_link_journal = LINK_AUTO;
482                                 arg_link_journal_try = false;
483                         } else if (streq(optarg, "no")) {
484                                 arg_link_journal = LINK_NO;
485                                 arg_link_journal_try = false;
486                         } else if (streq(optarg, "guest")) {
487                                 arg_link_journal = LINK_GUEST;
488                                 arg_link_journal_try = false;
489                         } else if (streq(optarg, "host")) {
490                                 arg_link_journal = LINK_HOST;
491                                 arg_link_journal_try = false;
492                         } else if (streq(optarg, "try-guest")) {
493                                 arg_link_journal = LINK_GUEST;
494                                 arg_link_journal_try = true;
495                         } else if (streq(optarg, "try-host")) {
496                                 arg_link_journal = LINK_HOST;
497                                 arg_link_journal_try = true;
498                         } else {
499                                 log_error("Failed to parse link journal mode %s", optarg);
500                                 return -EINVAL;
501                         }
502
503                         break;
504
505                 case ARG_BIND:
506                 case ARG_BIND_RO: {
507                         _cleanup_free_ char *a = NULL, *b = NULL;
508                         char *e;
509                         char ***x;
510
511                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
512
513                         e = strchr(optarg, ':');
514                         if (e) {
515                                 a = strndup(optarg, e - optarg);
516                                 b = strdup(e + 1);
517                         } else {
518                                 a = strdup(optarg);
519                                 b = strdup(optarg);
520                         }
521
522                         if (!a || !b)
523                                 return log_oom();
524
525                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
526                                 log_error("Invalid bind mount specification: %s", optarg);
527                                 return -EINVAL;
528                         }
529
530                         r = strv_extend(x, a);
531                         if (r < 0)
532                                 return log_oom();
533
534                         r = strv_extend(x, b);
535                         if (r < 0)
536                                 return log_oom();
537
538                         break;
539                 }
540
541                 case ARG_TMPFS: {
542                         _cleanup_free_ char *a = NULL, *b = NULL;
543                         char *e;
544
545                         e = strchr(optarg, ':');
546                         if (e) {
547                                 a = strndup(optarg, e - optarg);
548                                 b = strdup(e + 1);
549                         } else {
550                                 a = strdup(optarg);
551                                 b = strdup("mode=0755");
552                         }
553
554                         if (!a || !b)
555                                 return log_oom();
556
557                         if (!path_is_absolute(a)) {
558                                 log_error("Invalid tmpfs specification: %s", optarg);
559                                 return -EINVAL;
560                         }
561
562                         r = strv_push(&arg_tmpfs, a);
563                         if (r < 0)
564                                 return log_oom();
565
566                         a = NULL;
567
568                         r = strv_push(&arg_tmpfs, b);
569                         if (r < 0)
570                                 return log_oom();
571
572                         b = NULL;
573
574                         break;
575                 }
576
577                 case ARG_SETENV: {
578                         char **n;
579
580                         if (!env_assignment_is_valid(optarg)) {
581                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
582                                 return -EINVAL;
583                         }
584
585                         n = strv_env_set(arg_setenv, optarg);
586                         if (!n)
587                                 return log_oom();
588
589                         strv_free(arg_setenv);
590                         arg_setenv = n;
591                         break;
592                 }
593
594                 case 'q':
595                         arg_quiet = true;
596                         break;
597
598                 case ARG_SHARE_SYSTEM:
599                         arg_share_system = true;
600                         break;
601
602                 case ARG_REGISTER:
603                         r = parse_boolean(optarg);
604                         if (r < 0) {
605                                 log_error("Failed to parse --register= argument: %s", optarg);
606                                 return r;
607                         }
608
609                         arg_register = r;
610                         break;
611
612                 case ARG_KEEP_UNIT:
613                         arg_keep_unit = true;
614                         break;
615
616                 case ARG_PERSONALITY:
617
618                         arg_personality = personality_from_string(optarg);
619                         if (arg_personality == 0xffffffffLU) {
620                                 log_error("Unknown or unsupported personality '%s'.", optarg);
621                                 return -EINVAL;
622                         }
623
624                         break;
625
626                 case ARG_VOLATILE:
627
628                         if (!optarg)
629                                 arg_volatile = VOLATILE_YES;
630                         else {
631                                 r = parse_boolean(optarg);
632                                 if (r < 0) {
633                                         if (streq(optarg, "state"))
634                                                 arg_volatile = VOLATILE_STATE;
635                                         else {
636                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
637                                                 return r;
638                                         }
639                                 } else
640                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
641                         }
642
643                         break;
644
645                 case '?':
646                         return -EINVAL;
647
648                 default:
649                         assert_not_reached("Unhandled option");
650                 }
651
652         if (arg_share_system)
653                 arg_register = false;
654
655         if (arg_boot && arg_share_system) {
656                 log_error("--boot and --share-system may not be combined.");
657                 return -EINVAL;
658         }
659
660         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
661                 log_error("--keep-unit may not be used when invoked from a user session.");
662                 return -EINVAL;
663         }
664
665         if (arg_directory && arg_image) {
666                 log_error("--directory= and --image= may not be combined.");
667                 return -EINVAL;
668         }
669
670         if (arg_template && arg_image) {
671                 log_error("--template= and --image= may not be combined.");
672                 return -EINVAL;
673         }
674
675         if (arg_template && !(arg_directory || arg_machine)) {
676                 log_error("--template= needs --directory= or --machine=.");
677                 return -EINVAL;
678         }
679
680         if (arg_ephemeral && arg_template) {
681                 log_error("--ephemeral and --template= may not be combined.");
682                 return -EINVAL;
683         }
684
685         if (arg_ephemeral && arg_image) {
686                 log_error("--ephemeral and --image= may not be combined.");
687                 return -EINVAL;
688         }
689
690         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
691                 log_error("--ephemeral and --link-journal= may not be combined.");
692                 return -EINVAL;
693         }
694
695         if (arg_volatile != VOLATILE_NO && arg_read_only) {
696                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
697                 return -EINVAL;
698         }
699
700         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
701
702         return 1;
703 }
704
705 static int mount_all(const char *dest) {
706
707         typedef struct MountPoint {
708                 const char *what;
709                 const char *where;
710                 const char *type;
711                 const char *options;
712                 unsigned long flags;
713                 bool fatal;
714         } MountPoint;
715
716         static const MountPoint mount_table[] = {
717                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
718                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
719                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
720                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
721                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
722                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
723                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
724                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
725 #ifdef HAVE_SELINUX
726                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
727                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
728 #endif
729         };
730
731         unsigned k;
732         int r = 0;
733
734         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
735                 _cleanup_free_ char *where = NULL;
736 #ifdef HAVE_SELINUX
737                 _cleanup_free_ char *options = NULL;
738 #endif
739                 const char *o;
740                 int t;
741
742                 where = strjoin(dest, "/", mount_table[k].where, NULL);
743                 if (!where)
744                         return log_oom();
745
746                 t = path_is_mount_point(where, true);
747                 if (t < 0) {
748                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
749
750                         if (r == 0)
751                                 r = t;
752
753                         continue;
754                 }
755
756                 /* Skip this entry if it is not a remount. */
757                 if (mount_table[k].what && t > 0)
758                         continue;
759
760                 t = mkdir_p(where, 0755);
761                 if (t < 0) {
762                         if (mount_table[k].fatal) {
763                                log_error_errno(t, "Failed to create directory %s: %m", where);
764
765                                 if (r == 0)
766                                         r = t;
767                         } else
768                                log_warning_errno(t, "Failed to create directory %s: %m", where);
769
770                         continue;
771                 }
772
773 #ifdef HAVE_SELINUX
774                 if (arg_selinux_apifs_context &&
775                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
776                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
777                         if (!options)
778                                 return log_oom();
779
780                         o = options;
781                 } else
782 #endif
783                         o = mount_table[k].options;
784
785
786                 if (mount(mount_table[k].what,
787                           where,
788                           mount_table[k].type,
789                           mount_table[k].flags,
790                           o) < 0) {
791
792                         if (mount_table[k].fatal) {
793                                 log_error_errno(errno, "mount(%s) failed: %m", where);
794
795                                 if (r == 0)
796                                         r = -errno;
797                         } else
798                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
799                 }
800         }
801
802         return r;
803 }
804
805 static int mount_binds(const char *dest, char **l, bool ro) {
806         char **x, **y;
807
808         STRV_FOREACH_PAIR(x, y, l) {
809                 _cleanup_free_ char *where = NULL;
810                 struct stat source_st, dest_st;
811                 int r;
812
813                 if (stat(*x, &source_st) < 0)
814                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
815
816                 where = strappend(dest, *y);
817                 if (!where)
818                         return log_oom();
819
820                 r = stat(where, &dest_st);
821                 if (r == 0) {
822                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
823                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
824                                 return -EINVAL;
825                         }
826                 } else if (errno == ENOENT) {
827                         r = mkdir_parents_label(where, 0755);
828                         if (r < 0)
829                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
830                 } else {
831                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
832                         return -errno;
833                 }
834
835                 /* Create the mount point, but be conservative -- refuse to create block
836                  * and char devices. */
837                 if (S_ISDIR(source_st.st_mode)) {
838                         r = mkdir_label(where, 0755);
839                         if (r < 0 && errno != EEXIST)
840                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
841                 } else if (S_ISFIFO(source_st.st_mode)) {
842                         r = mkfifo(where, 0644);
843                         if (r < 0 && errno != EEXIST)
844                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
845                 } else if (S_ISSOCK(source_st.st_mode)) {
846                         r = mknod(where, 0644 | S_IFSOCK, 0);
847                         if (r < 0 && errno != EEXIST)
848                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
849                 } else if (S_ISREG(source_st.st_mode)) {
850                         r = touch(where);
851                         if (r < 0)
852                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
853                 } else {
854                         log_error("Refusing to create mountpoint for file: %s", *x);
855                         return -ENOTSUP;
856                 }
857
858                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
859                         return log_error_errno(errno, "mount(%s) failed: %m", where);
860
861                 if (ro) {
862                         r = bind_remount_recursive(where, true);
863                         if (r < 0)
864                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
865                 }
866         }
867
868         return 0;
869 }
870
871 static int mount_tmpfs(const char *dest) {
872         char **i, **o;
873
874         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
875                 _cleanup_free_ char *where = NULL;
876                 int r;
877
878                 where = strappend(dest, *i);
879                 if (!where)
880                         return log_oom();
881
882                 r = mkdir_label(where, 0755);
883                 if (r < 0 && r != -EEXIST)
884                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
885
886                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
887                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
888         }
889
890         return 0;
891 }
892
893 static int setup_timezone(const char *dest) {
894         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
895         char *z, *y;
896         int r;
897
898         assert(dest);
899
900         /* Fix the timezone, if possible */
901         r = readlink_malloc("/etc/localtime", &p);
902         if (r < 0) {
903                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
904                 return 0;
905         }
906
907         z = path_startswith(p, "../usr/share/zoneinfo/");
908         if (!z)
909                 z = path_startswith(p, "/usr/share/zoneinfo/");
910         if (!z) {
911                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
912                 return 0;
913         }
914
915         where = strappend(dest, "/etc/localtime");
916         if (!where)
917                 return log_oom();
918
919         r = readlink_malloc(where, &q);
920         if (r >= 0) {
921                 y = path_startswith(q, "../usr/share/zoneinfo/");
922                 if (!y)
923                         y = path_startswith(q, "/usr/share/zoneinfo/");
924
925                 /* Already pointing to the right place? Then do nothing .. */
926                 if (y && streq(y, z))
927                         return 0;
928         }
929
930         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
931         if (!check)
932                 return log_oom();
933
934         if (access(check, F_OK) < 0) {
935                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
936                 return 0;
937         }
938
939         what = strappend("../usr/share/zoneinfo/", z);
940         if (!what)
941                 return log_oom();
942
943         r = mkdir_parents(where, 0755);
944         if (r < 0) {
945                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
946
947                 return 0;
948         }
949
950         r = unlink(where);
951         if (r < 0 && errno != ENOENT) {
952                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
953
954                 return 0;
955         }
956
957         if (symlink(what, where) < 0) {
958                 log_error_errno(errno, "Failed to correct timezone of container: %m");
959                 return 0;
960         }
961
962         return 0;
963 }
964
965 static int setup_resolv_conf(const char *dest) {
966         _cleanup_free_ char *where = NULL;
967         int r;
968
969         assert(dest);
970
971         if (arg_private_network)
972                 return 0;
973
974         /* Fix resolv.conf, if possible */
975         where = strappend(dest, "/etc/resolv.conf");
976         if (!where)
977                 return log_oom();
978
979         /* We don't really care for the results of this really. If it
980          * fails, it fails, but meh... */
981         r = mkdir_parents(where, 0755);
982         if (r < 0) {
983                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
984
985                 return 0;
986         }
987
988         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
989         if (r < 0) {
990                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
991
992                 return 0;
993         }
994
995         return 0;
996 }
997
998 static int setup_volatile_state(const char *directory) {
999         const char *p;
1000         int r;
1001
1002         assert(directory);
1003
1004         if (arg_volatile != VOLATILE_STATE)
1005                 return 0;
1006
1007         /* --volatile=state means we simply overmount /var
1008            with a tmpfs, and the rest read-only. */
1009
1010         r = bind_remount_recursive(directory, true);
1011         if (r < 0)
1012                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1013
1014         p = strappenda(directory, "/var");
1015         r = mkdir(p, 0755);
1016         if (r < 0 && errno != EEXIST)
1017                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1018
1019         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1020                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1021
1022         return 0;
1023 }
1024
1025 static int setup_volatile(const char *directory) {
1026         bool tmpfs_mounted = false, bind_mounted = false;
1027         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1028         const char *f, *t;
1029         int r;
1030
1031         assert(directory);
1032
1033         if (arg_volatile != VOLATILE_YES)
1034                 return 0;
1035
1036         /* --volatile=yes means we mount a tmpfs to the root dir, and
1037            the original /usr to use inside it, and that read-only. */
1038
1039         if (!mkdtemp(template))
1040                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1041
1042         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1043                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1044                 r = -errno;
1045                 goto fail;
1046         }
1047
1048         tmpfs_mounted = true;
1049
1050         f = strappenda(directory, "/usr");
1051         t = strappenda(template, "/usr");
1052
1053         r = mkdir(t, 0755);
1054         if (r < 0 && errno != EEXIST) {
1055                 log_error_errno(errno, "Failed to create %s: %m", t);
1056                 r = -errno;
1057                 goto fail;
1058         }
1059
1060         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1061                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1062                 r = -errno;
1063                 goto fail;
1064         }
1065
1066         bind_mounted = true;
1067
1068         r = bind_remount_recursive(t, true);
1069         if (r < 0) {
1070                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1071                 goto fail;
1072         }
1073
1074         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1075                 log_error_errno(errno, "Failed to move root mount: %m");
1076                 r = -errno;
1077                 goto fail;
1078         }
1079
1080         rmdir(template);
1081
1082         return 0;
1083
1084 fail:
1085         if (bind_mounted)
1086                 umount(t);
1087         if (tmpfs_mounted)
1088                 umount(template);
1089         rmdir(template);
1090         return r;
1091 }
1092
1093 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1094
1095         snprintf(s, 37,
1096                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1097                  SD_ID128_FORMAT_VAL(id));
1098
1099         return s;
1100 }
1101
1102 static int setup_boot_id(const char *dest) {
1103         _cleanup_free_ char *from = NULL, *to = NULL;
1104         sd_id128_t rnd = {};
1105         char as_uuid[37];
1106         int r;
1107
1108         assert(dest);
1109
1110         if (arg_share_system)
1111                 return 0;
1112
1113         /* Generate a new randomized boot ID, so that each boot-up of
1114          * the container gets a new one */
1115
1116         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1117         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1118         if (!from || !to)
1119                 return log_oom();
1120
1121         r = sd_id128_randomize(&rnd);
1122         if (r < 0)
1123                 return log_error_errno(r, "Failed to generate random boot id: %m");
1124
1125         id128_format_as_uuid(rnd, as_uuid);
1126
1127         r = write_string_file(from, as_uuid);
1128         if (r < 0)
1129                 return log_error_errno(r, "Failed to write boot id: %m");
1130
1131         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1132                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1133                 r = -errno;
1134         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1135                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1136
1137         unlink(from);
1138         return r;
1139 }
1140
1141 static int copy_devnodes(const char *dest) {
1142
1143         static const char devnodes[] =
1144                 "null\0"
1145                 "zero\0"
1146                 "full\0"
1147                 "random\0"
1148                 "urandom\0"
1149                 "tty\0"
1150                 "net/tun\0";
1151
1152         const char *d;
1153         int r = 0;
1154         _cleanup_umask_ mode_t u;
1155
1156         assert(dest);
1157
1158         u = umask(0000);
1159
1160         NULSTR_FOREACH(d, devnodes) {
1161                 _cleanup_free_ char *from = NULL, *to = NULL;
1162                 struct stat st;
1163
1164                 from = strappend("/dev/", d);
1165                 to = strjoin(dest, "/dev/", d, NULL);
1166                 if (!from || !to)
1167                         return log_oom();
1168
1169                 if (stat(from, &st) < 0) {
1170
1171                         if (errno != ENOENT)
1172                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1173
1174                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1175
1176                         log_error("%s is not a char or block device, cannot copy", from);
1177                         return -EIO;
1178
1179                 } else {
1180                         r = mkdir_parents(to, 0775);
1181                         if (r < 0) {
1182                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1183                                 return -r;
1184                         }
1185
1186                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1187                                 return log_error_errno(errno, "mknod(%s) failed: %m", dest);
1188                 }
1189         }
1190
1191         return r;
1192 }
1193
1194 static int setup_ptmx(const char *dest) {
1195         _cleanup_free_ char *p = NULL;
1196
1197         p = strappend(dest, "/dev/ptmx");
1198         if (!p)
1199                 return log_oom();
1200
1201         if (symlink("pts/ptmx", p) < 0)
1202                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1203
1204         return 0;
1205 }
1206
1207 static int setup_dev_console(const char *dest, const char *console) {
1208         _cleanup_umask_ mode_t u;
1209         const char *to;
1210         struct stat st;
1211         int r;
1212
1213         assert(dest);
1214         assert(console);
1215
1216         u = umask(0000);
1217
1218         if (stat("/dev/null", &st) < 0)
1219                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1220
1221         r = chmod_and_chown(console, 0600, 0, 0);
1222         if (r < 0)
1223                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1224
1225         /* We need to bind mount the right tty to /dev/console since
1226          * ptys can only exist on pts file systems. To have something
1227          * to bind mount things on we create a device node first, and
1228          * use /dev/null for that since we the cgroups device policy
1229          * allows us to create that freely, while we cannot create
1230          * /dev/console. (Note that the major minor doesn't actually
1231          * matter here, since we mount it over anyway). */
1232
1233         to = strappenda(dest, "/dev/console");
1234         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1235                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1236
1237         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1238                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1239
1240         return 0;
1241 }
1242
1243 static int setup_kmsg(const char *dest, int kmsg_socket) {
1244         _cleanup_free_ char *from = NULL, *to = NULL;
1245         int r, fd, k;
1246         _cleanup_umask_ mode_t u;
1247         union {
1248                 struct cmsghdr cmsghdr;
1249                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1250         } control = {};
1251         struct msghdr mh = {
1252                 .msg_control = &control,
1253                 .msg_controllen = sizeof(control),
1254         };
1255         struct cmsghdr *cmsg;
1256
1257         assert(dest);
1258         assert(kmsg_socket >= 0);
1259
1260         u = umask(0000);
1261
1262         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1263          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1264          * on the reading side behave very similar to /proc/kmsg,
1265          * their writing side behaves differently from /dev/kmsg in
1266          * that writing blocks when nothing is reading. In order to
1267          * avoid any problems with containers deadlocking due to this
1268          * we simply make /dev/kmsg unavailable to the container. */
1269         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1270             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1271                 return log_oom();
1272
1273         if (mkfifo(from, 0600) < 0)
1274                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1275
1276         r = chmod_and_chown(from, 0600, 0, 0);
1277         if (r < 0)
1278                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1279
1280         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1281                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1282
1283         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1284         if (fd < 0)
1285                 return log_error_errno(errno, "Failed to open fifo: %m");
1286
1287         cmsg = CMSG_FIRSTHDR(&mh);
1288         cmsg->cmsg_level = SOL_SOCKET;
1289         cmsg->cmsg_type = SCM_RIGHTS;
1290         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1291         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1292
1293         mh.msg_controllen = cmsg->cmsg_len;
1294
1295         /* Store away the fd in the socket, so that it stays open as
1296          * long as we run the child */
1297         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1298         safe_close(fd);
1299
1300         if (k < 0)
1301                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1302
1303         /* And now make the FIFO unavailable as /dev/kmsg... */
1304         unlink(from);
1305         return 0;
1306 }
1307
1308 static int setup_hostname(void) {
1309
1310         if (arg_share_system)
1311                 return 0;
1312
1313         if (sethostname_idempotent(arg_machine) < 0)
1314                 return -errno;
1315
1316         return 0;
1317 }
1318
1319 static int setup_journal(const char *directory) {
1320         sd_id128_t machine_id, this_id;
1321         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1322         char *id;
1323         int r;
1324
1325         /* Don't link journals in ephemeral mode */
1326         if (arg_ephemeral)
1327                 return 0;
1328
1329         p = strappend(directory, "/etc/machine-id");
1330         if (!p)
1331                 return log_oom();
1332
1333         r = read_one_line_file(p, &b);
1334         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1335                 return 0;
1336         else if (r < 0)
1337                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1338
1339         id = strstrip(b);
1340         if (isempty(id) && arg_link_journal == LINK_AUTO)
1341                 return 0;
1342
1343         /* Verify validity */
1344         r = sd_id128_from_string(id, &machine_id);
1345         if (r < 0)
1346                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1347
1348         r = sd_id128_get_machine(&this_id);
1349         if (r < 0)
1350                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1351
1352         if (sd_id128_equal(machine_id, this_id)) {
1353                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1354                          "Host and machine ids are equal (%s): refusing to link journals", id);
1355                 if (arg_link_journal == LINK_AUTO)
1356                         return 0;
1357                 return -EEXIST;
1358         }
1359
1360         if (arg_link_journal == LINK_NO)
1361                 return 0;
1362
1363         free(p);
1364         p = strappend("/var/log/journal/", id);
1365         q = strjoin(directory, "/var/log/journal/", id, NULL);
1366         if (!p || !q)
1367                 return log_oom();
1368
1369         if (path_is_mount_point(p, false) > 0) {
1370                 if (arg_link_journal != LINK_AUTO) {
1371                         log_error("%s: already a mount point, refusing to use for journal", p);
1372                         return -EEXIST;
1373                 }
1374
1375                 return 0;
1376         }
1377
1378         if (path_is_mount_point(q, false) > 0) {
1379                 if (arg_link_journal != LINK_AUTO) {
1380                         log_error("%s: already a mount point, refusing to use for journal", q);
1381                         return -EEXIST;
1382                 }
1383
1384                 return 0;
1385         }
1386
1387         r = readlink_and_make_absolute(p, &d);
1388         if (r >= 0) {
1389                 if ((arg_link_journal == LINK_GUEST ||
1390                      arg_link_journal == LINK_AUTO) &&
1391                     path_equal(d, q)) {
1392
1393                         r = mkdir_p(q, 0755);
1394                         if (r < 0)
1395                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1396                         return 0;
1397                 }
1398
1399                 if (unlink(p) < 0)
1400                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1401         } else if (r == -EINVAL) {
1402
1403                 if (arg_link_journal == LINK_GUEST &&
1404                     rmdir(p) < 0) {
1405
1406                         if (errno == ENOTDIR) {
1407                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1408                                 return r;
1409                         } else {
1410                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1411                                 return -errno;
1412                         }
1413                 }
1414         } else if (r != -ENOENT) {
1415                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1416                 return r;
1417         }
1418
1419         if (arg_link_journal == LINK_GUEST) {
1420
1421                 if (symlink(q, p) < 0) {
1422                         if (arg_link_journal_try) {
1423                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1424                                 return 0;
1425                         } else {
1426                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1427                                 return -errno;
1428                         }
1429                 }
1430
1431                 r = mkdir_p(q, 0755);
1432                 if (r < 0)
1433                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1434                 return 0;
1435         }
1436
1437         if (arg_link_journal == LINK_HOST) {
1438                 /* don't create parents here -- if the host doesn't have
1439                  * permanent journal set up, don't force it here */
1440                 r = mkdir(p, 0755);
1441                 if (r < 0) {
1442                         if (arg_link_journal_try) {
1443                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1444                                 return 0;
1445                         } else {
1446                                 log_error_errno(errno, "Failed to create %s: %m", p);
1447                                 return r;
1448                         }
1449                 }
1450
1451         } else if (access(p, F_OK) < 0)
1452                 return 0;
1453
1454         if (dir_is_empty(q) == 0)
1455                 log_warning("%s is not empty, proceeding anyway.", q);
1456
1457         r = mkdir_p(q, 0755);
1458         if (r < 0) {
1459                 log_error_errno(errno, "Failed to create %s: %m", q);
1460                 return r;
1461         }
1462
1463         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1464                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1465
1466         return 0;
1467 }
1468
1469 static int drop_capabilities(void) {
1470         return capability_bounding_set_drop(~arg_retain, false);
1471 }
1472
1473 static int register_machine(pid_t pid, int local_ifindex) {
1474         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1475         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1476         int r;
1477
1478         if (!arg_register)
1479                 return 0;
1480
1481         r = sd_bus_default_system(&bus);
1482         if (r < 0)
1483                 return log_error_errno(r, "Failed to open system bus: %m");
1484
1485         if (arg_keep_unit) {
1486                 r = sd_bus_call_method(
1487                                 bus,
1488                                 "org.freedesktop.machine1",
1489                                 "/org/freedesktop/machine1",
1490                                 "org.freedesktop.machine1.Manager",
1491                                 "RegisterMachineWithNetwork",
1492                                 &error,
1493                                 NULL,
1494                                 "sayssusai",
1495                                 arg_machine,
1496                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1497                                 "nspawn",
1498                                 "container",
1499                                 (uint32_t) pid,
1500                                 strempty(arg_directory),
1501                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1502         } else {
1503                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1504
1505                 r = sd_bus_message_new_method_call(
1506                                 bus,
1507                                 &m,
1508                                 "org.freedesktop.machine1",
1509                                 "/org/freedesktop/machine1",
1510                                 "org.freedesktop.machine1.Manager",
1511                                 "CreateMachineWithNetwork");
1512                 if (r < 0)
1513                         return log_error_errno(r, "Failed to create message: %m");
1514
1515                 r = sd_bus_message_append(
1516                                 m,
1517                                 "sayssusai",
1518                                 arg_machine,
1519                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1520                                 "nspawn",
1521                                 "container",
1522                                 (uint32_t) pid,
1523                                 strempty(arg_directory),
1524                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1525                 if (r < 0)
1526                         return log_error_errno(r, "Failed to append message arguments: %m");
1527
1528                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1529                 if (r < 0)
1530                         return log_error_errno(r, "Failed to open container: %m");
1531
1532                 if (!isempty(arg_slice)) {
1533                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1534                         if (r < 0)
1535                                 return log_error_errno(r, "Failed to append slice: %m");
1536                 }
1537
1538                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1539                 if (r < 0)
1540                         return log_error_errno(r, "Failed to add device policy: %m");
1541
1542                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1543                                           /* Allow the container to
1544                                            * access and create the API
1545                                            * device nodes, so that
1546                                            * PrivateDevices= in the
1547                                            * container can work
1548                                            * fine */
1549                                           "/dev/null", "rwm",
1550                                           "/dev/zero", "rwm",
1551                                           "/dev/full", "rwm",
1552                                           "/dev/random", "rwm",
1553                                           "/dev/urandom", "rwm",
1554                                           "/dev/tty", "rwm",
1555                                           "/dev/net/tun", "rwm",
1556                                           /* Allow the container
1557                                            * access to ptys. However,
1558                                            * do not permit the
1559                                            * container to ever create
1560                                            * these device nodes. */
1561                                           "/dev/pts/ptmx", "rw",
1562                                           "char-pts", "rw");
1563                 if (r < 0)
1564                         return log_error_errno(r, "Failed to add device whitelist: %m");
1565
1566                 r = sd_bus_message_close_container(m);
1567                 if (r < 0)
1568                         return log_error_errno(r, "Failed to close container: %m");
1569
1570                 r = sd_bus_call(bus, m, 0, &error, NULL);
1571         }
1572
1573         if (r < 0) {
1574                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1575                 return r;
1576         }
1577
1578         return 0;
1579 }
1580
1581 static int terminate_machine(pid_t pid) {
1582         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1583         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1584         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1585         const char *path;
1586         int r;
1587
1588         if (!arg_register)
1589                 return 0;
1590
1591         r = sd_bus_default_system(&bus);
1592         if (r < 0)
1593                 return log_error_errno(r, "Failed to open system bus: %m");
1594
1595         r = sd_bus_call_method(
1596                         bus,
1597                         "org.freedesktop.machine1",
1598                         "/org/freedesktop/machine1",
1599                         "org.freedesktop.machine1.Manager",
1600                         "GetMachineByPID",
1601                         &error,
1602                         &reply,
1603                         "u",
1604                         (uint32_t) pid);
1605         if (r < 0) {
1606                 /* Note that the machine might already have been
1607                  * cleaned up automatically, hence don't consider it a
1608                  * failure if we cannot get the machine object. */
1609                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1610                 return 0;
1611         }
1612
1613         r = sd_bus_message_read(reply, "o", &path);
1614         if (r < 0)
1615                 return bus_log_parse_error(r);
1616
1617         r = sd_bus_call_method(
1618                         bus,
1619                         "org.freedesktop.machine1",
1620                         path,
1621                         "org.freedesktop.machine1.Machine",
1622                         "Terminate",
1623                         &error,
1624                         NULL,
1625                         NULL);
1626         if (r < 0) {
1627                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1628                 return 0;
1629         }
1630
1631         return 0;
1632 }
1633
1634 static int reset_audit_loginuid(void) {
1635         _cleanup_free_ char *p = NULL;
1636         int r;
1637
1638         if (arg_share_system)
1639                 return 0;
1640
1641         r = read_one_line_file("/proc/self/loginuid", &p);
1642         if (r == -ENOENT)
1643                 return 0;
1644         if (r < 0)
1645                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1646
1647         /* Already reset? */
1648         if (streq(p, "4294967295"))
1649                 return 0;
1650
1651         r = write_string_file("/proc/self/loginuid", "4294967295");
1652         if (r < 0) {
1653                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1654                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1655                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1656                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1657                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1658
1659                 sleep(5);
1660         }
1661
1662         return 0;
1663 }
1664
1665 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1666 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1667 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
1668
1669 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
1670         uint8_t result[8];
1671         size_t l, sz;
1672         uint8_t *v, *i;
1673         int r;
1674
1675         l = strlen(arg_machine);
1676         sz = sizeof(sd_id128_t) + l;
1677         if (idx > 0)
1678                 sz += sizeof(idx);
1679
1680         v = alloca(sz);
1681
1682         /* fetch some persistent data unique to the host */
1683         r = sd_id128_get_machine((sd_id128_t*) v);
1684         if (r < 0)
1685                 return r;
1686
1687         /* combine with some data unique (on this host) to this
1688          * container instance */
1689         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
1690         if (idx > 0) {
1691                 idx = htole64(idx);
1692                 memcpy(i, &idx, sizeof(idx));
1693         }
1694
1695         /* Let's hash the host machine ID plus the container name. We
1696          * use a fixed, but originally randomly created hash key here. */
1697         siphash24(result, v, sz, hash_key.bytes);
1698
1699         assert_cc(ETH_ALEN <= sizeof(result));
1700         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1701
1702         /* see eth_random_addr in the kernel */
1703         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1704         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1705
1706         return 0;
1707 }
1708
1709 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1710         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1711         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1712         struct ether_addr mac_host, mac_container;
1713         int r, i;
1714
1715         if (!arg_private_network)
1716                 return 0;
1717
1718         if (!arg_network_veth)
1719                 return 0;
1720
1721         /* Use two different interface name prefixes depending whether
1722          * we are in bridge mode or not. */
1723         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1724                  arg_network_bridge ? "vb" : "ve", arg_machine);
1725
1726         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1727         if (r < 0)
1728                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
1729
1730         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1731         if (r < 0)
1732                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
1733
1734         r = sd_rtnl_open(&rtnl, 0);
1735         if (r < 0)
1736                 return log_error_errno(r, "Failed to connect to netlink: %m");
1737
1738         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1739         if (r < 0)
1740                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1741
1742         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1743         if (r < 0)
1744                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1745
1746         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1747         if (r < 0)
1748                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1749
1750         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1751         if (r < 0)
1752                 return log_error_errno(r, "Failed to open netlink container: %m");
1753
1754         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1755         if (r < 0)
1756                 return log_error_errno(r, "Failed to open netlink container: %m");
1757
1758         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1759         if (r < 0)
1760                 return log_error_errno(r, "Failed to open netlink container: %m");
1761
1762         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1763         if (r < 0)
1764                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1765
1766         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1767         if (r < 0)
1768                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1769
1770         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1771         if (r < 0)
1772                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1773
1774         r = sd_rtnl_message_close_container(m);
1775         if (r < 0)
1776                 return log_error_errno(r, "Failed to close netlink container: %m");
1777
1778         r = sd_rtnl_message_close_container(m);
1779         if (r < 0)
1780                 return log_error_errno(r, "Failed to close netlink container: %m");
1781
1782         r = sd_rtnl_message_close_container(m);
1783         if (r < 0)
1784                 return log_error_errno(r, "Failed to close netlink container: %m");
1785
1786         r = sd_rtnl_call(rtnl, m, 0, NULL);
1787         if (r < 0)
1788                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
1789
1790         i = (int) if_nametoindex(iface_name);
1791         if (i <= 0)
1792                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
1793
1794         *ifi = i;
1795
1796         return 0;
1797 }
1798
1799 static int setup_bridge(const char veth_name[], int *ifi) {
1800         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1801         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1802         int r, bridge;
1803
1804         if (!arg_private_network)
1805                 return 0;
1806
1807         if (!arg_network_veth)
1808                 return 0;
1809
1810         if (!arg_network_bridge)
1811                 return 0;
1812
1813         bridge = (int) if_nametoindex(arg_network_bridge);
1814         if (bridge <= 0)
1815                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
1816
1817         *ifi = bridge;
1818
1819         r = sd_rtnl_open(&rtnl, 0);
1820         if (r < 0)
1821                 return log_error_errno(r, "Failed to connect to netlink: %m");
1822
1823         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1824         if (r < 0)
1825                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1826
1827         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1828         if (r < 0)
1829                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
1830
1831         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1832         if (r < 0)
1833                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
1834
1835         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1836         if (r < 0)
1837                 return log_error_errno(r, "Failed to add netlink master field: %m");
1838
1839         r = sd_rtnl_call(rtnl, m, 0, NULL);
1840         if (r < 0)
1841                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
1842
1843         return 0;
1844 }
1845
1846 static int parse_interface(struct udev *udev, const char *name) {
1847         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1848         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1849         int ifi;
1850
1851         ifi = (int) if_nametoindex(name);
1852         if (ifi <= 0)
1853                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
1854
1855         sprintf(ifi_str, "n%i", ifi);
1856         d = udev_device_new_from_device_id(udev, ifi_str);
1857         if (!d)
1858                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
1859
1860         if (udev_device_get_is_initialized(d) <= 0) {
1861                 log_error("Network interface %s is not initialized yet.", name);
1862                 return -EBUSY;
1863         }
1864
1865         return ifi;
1866 }
1867
1868 static int move_network_interfaces(pid_t pid) {
1869         _cleanup_udev_unref_ struct udev *udev = NULL;
1870         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1871         char **i;
1872         int r;
1873
1874         if (!arg_private_network)
1875                 return 0;
1876
1877         if (strv_isempty(arg_network_interfaces))
1878                 return 0;
1879
1880         r = sd_rtnl_open(&rtnl, 0);
1881         if (r < 0)
1882                 return log_error_errno(r, "Failed to connect to netlink: %m");
1883
1884         udev = udev_new();
1885         if (!udev) {
1886                 log_error("Failed to connect to udev.");
1887                 return -ENOMEM;
1888         }
1889
1890         STRV_FOREACH(i, arg_network_interfaces) {
1891                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1892                 int ifi;
1893
1894                 ifi = parse_interface(udev, *i);
1895                 if (ifi < 0)
1896                         return ifi;
1897
1898                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1899                 if (r < 0)
1900                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1901
1902                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1903                 if (r < 0)
1904                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
1905
1906                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1907                 if (r < 0)
1908                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
1909         }
1910
1911         return 0;
1912 }
1913
1914 static int setup_macvlan(pid_t pid) {
1915         _cleanup_udev_unref_ struct udev *udev = NULL;
1916         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1917         unsigned idx = 0;
1918         char **i;
1919         int r;
1920
1921         if (!arg_private_network)
1922                 return 0;
1923
1924         if (strv_isempty(arg_network_macvlan))
1925                 return 0;
1926
1927         r = sd_rtnl_open(&rtnl, 0);
1928         if (r < 0)
1929                 return log_error_errno(r, "Failed to connect to netlink: %m");
1930
1931         udev = udev_new();
1932         if (!udev) {
1933                 log_error("Failed to connect to udev.");
1934                 return -ENOMEM;
1935         }
1936
1937         STRV_FOREACH(i, arg_network_macvlan) {
1938                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1939                 _cleanup_free_ char *n = NULL;
1940                 struct ether_addr mac;
1941                 int ifi;
1942
1943                 ifi = parse_interface(udev, *i);
1944                 if (ifi < 0)
1945                         return ifi;
1946
1947                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
1948                 if (r < 0)
1949                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
1950
1951                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1952                 if (r < 0)
1953                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1954
1955                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1956                 if (r < 0)
1957                         return log_error_errno(r, "Failed to add netlink interface index: %m");
1958
1959                 n = strappend("mv-", *i);
1960                 if (!n)
1961                         return log_oom();
1962
1963                 strshorten(n, IFNAMSIZ-1);
1964
1965                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1966                 if (r < 0)
1967                         return log_error_errno(r, "Failed to add netlink interface name: %m");
1968
1969                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1970                 if (r < 0)
1971                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
1972
1973                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1974                 if (r < 0)
1975                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
1976
1977                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1978                 if (r < 0)
1979                         return log_error_errno(r, "Failed to open netlink container: %m");
1980
1981                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1982                 if (r < 0)
1983                         return log_error_errno(r, "Failed to open netlink container: %m");
1984
1985                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1986                 if (r < 0)
1987                         return log_error_errno(r, "Failed to append macvlan mode: %m");
1988
1989                 r = sd_rtnl_message_close_container(m);
1990                 if (r < 0)
1991                         return log_error_errno(r, "Failed to close netlink container: %m");
1992
1993                 r = sd_rtnl_message_close_container(m);
1994                 if (r < 0)
1995                         return log_error_errno(r, "Failed to close netlink container: %m");
1996
1997                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1998                 if (r < 0)
1999                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2000         }
2001
2002         return 0;
2003 }
2004
/* Installs the seccomp filter for the container payload.
 *
 * The filter allows everything by default, makes a fixed list of
 * system calls fail with EPERM and makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns the result of the last libseccomp call (negative on
 * failure, after logging). When built without HAVE_SECCOMP this is a
 * no-op returning 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blocked_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[k], 0);
                if (r >= 0 || r == -EFAULT)
                        continue; /* added, or unknown syscall */

                log_error_errno(r, "Failed to block syscall: %m");
                goto finish;
        }

        /* Audit is broken in containers, and much of the userspace
         * audit hookup fails when run inside of one. We do not care
         * and simply turn off creation of audit sockets.
         *
         * socket(AF_NETLINK, *, NETLINK_AUDIT) then fails with
         * EAFNOSUPPORT, which audit userspace takes as indication
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(ctx,
                             SCMP_ACT_ERRNO(EAFNOSUPPORT),
                             SCMP_SYS(socket),
                             2,
                             SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                             SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on success and failure alike */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2084
2085 static int setup_image(char **device_path, int *loop_nr) {
2086         struct loop_info64 info = {
2087                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2088         };
2089         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2090         _cleanup_free_ char* loopdev = NULL;
2091         struct stat st;
2092         int r, nr;
2093
2094         assert(device_path);
2095         assert(loop_nr);
2096         assert(arg_image);
2097
2098         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2099         if (fd < 0)
2100                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2101
2102         if (fstat(fd, &st) < 0)
2103                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2104
2105         if (S_ISBLK(st.st_mode)) {
2106                 char *p;
2107
2108                 p = strdup(arg_image);
2109                 if (!p)
2110                         return log_oom();
2111
2112                 *device_path = p;
2113
2114                 *loop_nr = -1;
2115
2116                 r = fd;
2117                 fd = -1;
2118
2119                 return r;
2120         }
2121
2122         if (!S_ISREG(st.st_mode)) {
2123                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2124                 return -EINVAL;
2125         }
2126
2127         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2128         if (control < 0)
2129                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2130
2131         nr = ioctl(control, LOOP_CTL_GET_FREE);
2132         if (nr < 0)
2133                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2134
2135         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2136                 return log_oom();
2137
2138         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2139         if (loop < 0)
2140                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2141
2142         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2143                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2144
2145         if (arg_read_only)
2146                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2147
2148         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2149                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2150
2151         *device_path = loopdev;
2152         loopdev = NULL;
2153
2154         *loop_nr = nr;
2155
2156         r = loop;
2157         loop = -1;
2158
2159         return r;
2160 }
2161
2162 static int dissect_image(
2163                 int fd,
2164                 char **root_device, bool *root_device_rw,
2165                 char **home_device, bool *home_device_rw,
2166                 char **srv_device, bool *srv_device_rw,
2167                 bool *secondary) {
2168
2169 #ifdef HAVE_BLKID
2170         int home_nr = -1, srv_nr = -1;
2171 #ifdef GPT_ROOT_NATIVE
2172         int root_nr = -1;
2173 #endif
2174 #ifdef GPT_ROOT_SECONDARY
2175         int secondary_root_nr = -1;
2176 #endif
2177
2178         _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2179         _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2180         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2181         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2182         _cleanup_udev_unref_ struct udev *udev = NULL;
2183         struct udev_list_entry *first, *item;
2184         bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2185         const char *pttype = NULL;
2186         blkid_partlist pl;
2187         struct stat st;
2188         int r;
2189
2190         assert(fd >= 0);
2191         assert(root_device);
2192         assert(home_device);
2193         assert(srv_device);
2194         assert(secondary);
2195         assert(arg_image);
2196
2197         b = blkid_new_probe();
2198         if (!b)
2199                 return log_oom();
2200
2201         errno = 0;
2202         r = blkid_probe_set_device(b, fd, 0, 0);
2203         if (r != 0) {
2204                 if (errno == 0)
2205                         return log_oom();
2206
2207                 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2208                 return -errno;
2209         }
2210
2211         blkid_probe_enable_partitions(b, 1);
2212         blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2213
2214         errno = 0;
2215         r = blkid_do_safeprobe(b);
2216         if (r == -2 || r == 1) {
2217                 log_error("Failed to identify any partition table on %s.\n"
2218                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2219                 return -EINVAL;
2220         } else if (r != 0) {
2221                 if (errno == 0)
2222                         errno = EIO;
2223                 log_error_errno(errno, "Failed to probe: %m");
2224                 return -errno;
2225         }
2226
2227         blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2228         if (!streq_ptr(pttype, "gpt")) {
2229                 log_error("Image %s does not carry a GUID Partition Table.\n"
2230                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2231                 return -EINVAL;
2232         }
2233
2234         errno = 0;
2235         pl = blkid_probe_get_partitions(b);
2236         if (!pl) {
2237                 if (errno == 0)
2238                         return log_oom();
2239
2240                 log_error("Failed to list partitions of %s", arg_image);
2241                 return -errno;
2242         }
2243
2244         udev = udev_new();
2245         if (!udev)
2246                 return log_oom();
2247
2248         if (fstat(fd, &st) < 0)
2249                 return log_error_errno(errno, "Failed to stat block device: %m");
2250
2251         d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2252         if (!d)
2253                 return log_oom();
2254
2255         e = udev_enumerate_new(udev);
2256         if (!e)
2257                 return log_oom();
2258
2259         r = udev_enumerate_add_match_parent(e, d);
2260         if (r < 0)
2261                 return log_oom();
2262
2263         r = udev_enumerate_scan_devices(e);
2264         if (r < 0)
2265                 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2266
2267         first = udev_enumerate_get_list_entry(e);
2268         udev_list_entry_foreach(item, first) {
2269                 _cleanup_udev_device_unref_ struct udev_device *q;
2270                 const char *stype, *node;
2271                 unsigned long long flags;
2272                 sd_id128_t type_id;
2273                 blkid_partition pp;
2274                 dev_t qn;
2275                 int nr;
2276
2277                 errno = 0;
2278                 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2279                 if (!q) {
2280                         if (!errno)
2281                                 errno = ENOMEM;
2282
2283                         log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2284                         return -errno;
2285                 }
2286
2287                 qn = udev_device_get_devnum(q);
2288                 if (major(qn) == 0)
2289                         continue;
2290
2291                 if (st.st_rdev == qn)
2292                         continue;
2293
2294                 node = udev_device_get_devnode(q);
2295                 if (!node)
2296                         continue;
2297
2298                 pp = blkid_partlist_devno_to_partition(pl, qn);
2299                 if (!pp)
2300                         continue;
2301
2302                 flags = blkid_partition_get_flags(pp);
2303                 if (flags & GPT_FLAG_NO_AUTO)
2304                         continue;
2305
2306                 nr = blkid_partition_get_partno(pp);
2307                 if (nr < 0)
2308                         continue;
2309
2310                 stype = blkid_partition_get_type_string(pp);
2311                 if (!stype)
2312                         continue;
2313
2314                 if (sd_id128_from_string(stype, &type_id) < 0)
2315                         continue;
2316
2317                 if (sd_id128_equal(type_id, GPT_HOME)) {
2318
2319                         if (home && nr >= home_nr)
2320                                 continue;
2321
2322                         home_nr = nr;
2323                         home_rw = !(flags & GPT_FLAG_READ_ONLY);
2324
2325                         free(home);
2326                         home = strdup(node);
2327                         if (!home)
2328                                 return log_oom();
2329                 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2330
2331                         if (srv && nr >= srv_nr)
2332                                 continue;
2333
2334                         srv_nr = nr;
2335                         srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2336
2337                         free(srv);
2338                         srv = strdup(node);
2339                         if (!srv)
2340                                 return log_oom();
2341                 }
2342 #ifdef GPT_ROOT_NATIVE
2343                 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2344
2345                         if (root && nr >= root_nr)
2346                                 continue;
2347
2348                         root_nr = nr;
2349                         root_rw = !(flags & GPT_FLAG_READ_ONLY);
2350
2351                         free(root);
2352                         root = strdup(node);
2353                         if (!root)
2354                                 return log_oom();
2355                 }
2356 #endif
2357 #ifdef GPT_ROOT_SECONDARY
2358                 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2359
2360                         if (secondary_root && nr >= secondary_root_nr)
2361                                 continue;
2362
2363                         secondary_root_nr = nr;
2364                         secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2365
2366
2367                         free(secondary_root);
2368                         secondary_root = strdup(node);
2369                         if (!secondary_root)
2370                                 return log_oom();
2371                 }
2372 #endif
2373         }
2374
2375         if (!root && !secondary_root) {
2376                 log_error("Failed to identify root partition in disk image %s.\n"
2377                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2378                 return -EINVAL;
2379         }
2380
2381         if (root) {
2382                 *root_device = root;
2383                 root = NULL;
2384
2385                 *root_device_rw = root_rw;
2386                 *secondary = false;
2387         } else if (secondary_root) {
2388                 *root_device = secondary_root;
2389                 secondary_root = NULL;
2390
2391                 *root_device_rw = secondary_root_rw;
2392                 *secondary = true;
2393         }
2394
2395         if (home) {
2396                 *home_device = home;
2397                 home = NULL;
2398
2399                 *home_device_rw = home_rw;
2400         }
2401
2402         if (srv) {
2403                 *srv_device = srv;
2404                 srv = NULL;
2405
2406                 *srv_device_rw = srv_rw;
2407         }
2408
2409         return 0;
2410 #else
2411         log_error("--image= is not supported, compiled without blkid support.");
2412         return -ENOTSUP;
2413 #endif
2414 }
2415
/* Probes the file system type of the block device 'what' with libblkid and
 * mounts it with MS_NODEV on 'where', or on 'where' with 'directory' appended
 * if 'directory' is non-NULL. The mount is read-only unless 'rw' is true and
 * arg_read_only is unset. Devices whose type is "crypto_LUKS" are refused.
 *
 * Returns 0 on success and a negative errno-style value on failure. When
 * built without HAVE_BLKID this always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-partition flag */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* libblkid does not reliably set errno, hence clear it first and
         * treat "still 0" as an allocation failure */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                /* Return what the logging call hands back, so that we never
                 * read errno after logging might have changed it */
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1: nothing detected, -2: ambivalent result */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* -1: real probing error */
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Capture the error code before logging can overwrite errno */
                r = errno != 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2479
2480 static int mount_devices(
2481                 const char *where,
2482                 const char *root_device, bool root_device_rw,
2483                 const char *home_device, bool home_device_rw,
2484                 const char *srv_device, bool srv_device_rw) {
2485         int r;
2486
2487         assert(where);
2488
2489         if (root_device) {
2490                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2491                 if (r < 0)
2492                         return log_error_errno(r, "Failed to mount root directory: %m");
2493         }
2494
2495         if (home_device) {
2496                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2497                 if (r < 0)
2498                         return log_error_errno(r, "Failed to mount home directory: %m");
2499         }
2500
2501         if (srv_device) {
2502                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2503                 if (r < 0)
2504                         return log_error_errno(r, "Failed to mount server data directory: %m");
2505         }
2506
2507         return 0;
2508 }
2509
2510 static void loop_remove(int nr, int *image_fd) {
2511         _cleanup_close_ int control = -1;
2512         int r;
2513
2514         if (nr < 0)
2515                 return;
2516
2517         if (image_fd && *image_fd >= 0) {
2518                 r = ioctl(*image_fd, LOOP_CLR_FD);
2519                 if (r < 0)
2520                         log_warning_errno(errno, "Failed to close loop image: %m");
2521                 *image_fd = safe_close(*image_fd);
2522         }
2523
2524         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2525         if (control < 0) {
2526                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2527                 return;
2528         }
2529
2530         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2531         if (r < 0)
2532                 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
2533 }
2534
2535 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2536         int pipe_fds[2];
2537         pid_t pid;
2538
2539         assert(database);
2540         assert(key);
2541         assert(rpid);
2542
2543         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2544                 return log_error_errno(errno, "Failed to allocate pipe: %m");
2545
2546         pid = fork();
2547         if (pid < 0)
2548                 return log_error_errno(errno, "Failed to fork getent child: %m");
2549         else if (pid == 0) {
2550                 int nullfd;
2551                 char *empty_env = NULL;
2552
2553                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2554                         _exit(EXIT_FAILURE);
2555
2556                 if (pipe_fds[0] > 2)
2557                         safe_close(pipe_fds[0]);
2558                 if (pipe_fds[1] > 2)
2559                         safe_close(pipe_fds[1]);
2560
2561                 nullfd = open("/dev/null", O_RDWR);
2562                 if (nullfd < 0)
2563                         _exit(EXIT_FAILURE);
2564
2565                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2566                         _exit(EXIT_FAILURE);
2567
2568                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2569                         _exit(EXIT_FAILURE);
2570
2571                 if (nullfd > 2)
2572                         safe_close(nullfd);
2573
2574                 reset_all_signal_handlers();
2575                 close_all_fds(NULL, 0);
2576
2577                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2578                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2579                 _exit(EXIT_FAILURE);
2580         }
2581
2582         pipe_fds[1] = safe_close(pipe_fds[1]);
2583
2584         *rpid = pid;
2585
2586         return pipe_fds[0];
2587 }
2588
2589 static int change_uid_gid(char **_home) {
2590         char line[LINE_MAX], *x, *u, *g, *h;
2591         const char *word, *state;
2592         _cleanup_free_ uid_t *uids = NULL;
2593         _cleanup_free_ char *home = NULL;
2594         _cleanup_fclose_ FILE *f = NULL;
2595         _cleanup_close_ int fd = -1;
2596         unsigned n_uids = 0;
2597         size_t sz = 0, l;
2598         uid_t uid;
2599         gid_t gid;
2600         pid_t pid;
2601         int r;
2602
2603         assert(_home);
2604
2605         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2606                 /* Reset everything fully to 0, just in case */
2607
2608                 if (setgroups(0, NULL) < 0)
2609                         return log_error_errno(errno, "setgroups() failed: %m");
2610
2611                 if (setresgid(0, 0, 0) < 0)
2612                         return log_error_errno(errno, "setregid() failed: %m");
2613
2614                 if (setresuid(0, 0, 0) < 0)
2615                         return log_error_errno(errno, "setreuid() failed: %m");
2616
2617                 *_home = NULL;
2618                 return 0;
2619         }
2620
2621         /* First, get user credentials */
2622         fd = spawn_getent("passwd", arg_user, &pid);
2623         if (fd < 0)
2624                 return fd;
2625
2626         f = fdopen(fd, "r");
2627         if (!f)
2628                 return log_oom();
2629         fd = -1;
2630
2631         if (!fgets(line, sizeof(line), f)) {
2632
2633                 if (!ferror(f)) {
2634                         log_error("Failed to resolve user %s.", arg_user);
2635                         return -ESRCH;
2636                 }
2637
2638                 log_error_errno(errno, "Failed to read from getent: %m");
2639                 return -errno;
2640         }
2641
2642         truncate_nl(line);
2643
2644         wait_for_terminate_and_warn("getent passwd", pid, true);
2645
2646         x = strchr(line, ':');
2647         if (!x) {
2648                 log_error("/etc/passwd entry has invalid user field.");
2649                 return -EIO;
2650         }
2651
2652         u = strchr(x+1, ':');
2653         if (!u) {
2654                 log_error("/etc/passwd entry has invalid password field.");
2655                 return -EIO;
2656         }
2657
2658         u++;
2659         g = strchr(u, ':');
2660         if (!g) {
2661                 log_error("/etc/passwd entry has invalid UID field.");
2662                 return -EIO;
2663         }
2664
2665         *g = 0;
2666         g++;
2667         x = strchr(g, ':');
2668         if (!x) {
2669                 log_error("/etc/passwd entry has invalid GID field.");
2670                 return -EIO;
2671         }
2672
2673         *x = 0;
2674         h = strchr(x+1, ':');
2675         if (!h) {
2676                 log_error("/etc/passwd entry has invalid GECOS field.");
2677                 return -EIO;
2678         }
2679
2680         h++;
2681         x = strchr(h, ':');
2682         if (!x) {
2683                 log_error("/etc/passwd entry has invalid home directory field.");
2684                 return -EIO;
2685         }
2686
2687         *x = 0;
2688
2689         r = parse_uid(u, &uid);
2690         if (r < 0) {
2691                 log_error("Failed to parse UID of user.");
2692                 return -EIO;
2693         }
2694
2695         r = parse_gid(g, &gid);
2696         if (r < 0) {
2697                 log_error("Failed to parse GID of user.");
2698                 return -EIO;
2699         }
2700
2701         home = strdup(h);
2702         if (!home)
2703                 return log_oom();
2704
2705         /* Second, get group memberships */
2706         fd = spawn_getent("initgroups", arg_user, &pid);
2707         if (fd < 0)
2708                 return fd;
2709
2710         fclose(f);
2711         f = fdopen(fd, "r");
2712         if (!f)
2713                 return log_oom();
2714         fd = -1;
2715
2716         if (!fgets(line, sizeof(line), f)) {
2717                 if (!ferror(f)) {
2718                         log_error("Failed to resolve user %s.", arg_user);
2719                         return -ESRCH;
2720                 }
2721
2722                 log_error_errno(errno, "Failed to read from getent: %m");
2723                 return -errno;
2724         }
2725
2726         truncate_nl(line);
2727
2728         wait_for_terminate_and_warn("getent initgroups", pid, true);
2729
2730         /* Skip over the username and subsequent separator whitespace */
2731         x = line;
2732         x += strcspn(x, WHITESPACE);
2733         x += strspn(x, WHITESPACE);
2734
2735         FOREACH_WORD(word, l, x, state) {
2736                 char c[l+1];
2737
2738                 memcpy(c, word, l);
2739                 c[l] = 0;
2740
2741                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2742                         return log_oom();
2743
2744                 r = parse_uid(c, &uids[n_uids++]);
2745                 if (r < 0) {
2746                         log_error("Failed to parse group data from getent.");
2747                         return -EIO;
2748                 }
2749         }
2750
2751         r = mkdir_parents(home, 0775);
2752         if (r < 0)
2753                 return log_error_errno(r, "Failed to make home root directory: %m");
2754
2755         r = mkdir_safe(home, 0755, uid, gid);
2756         if (r < 0 && r != -EEXIST)
2757                 return log_error_errno(r, "Failed to make home directory: %m");
2758
2759         fchown(STDIN_FILENO, uid, gid);
2760         fchown(STDOUT_FILENO, uid, gid);
2761         fchown(STDERR_FILENO, uid, gid);
2762
2763         if (setgroups(n_uids, uids) < 0)
2764                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
2765
2766         if (setresgid(gid, gid, gid) < 0)
2767                 return log_error_errno(errno, "setregid() failed: %m");
2768
2769         if (setresuid(uid, uid, uid) < 0)
2770                 return log_error_errno(errno, "setreuid() failed: %m");
2771
2772         if (_home) {
2773                 *_home = home;
2774                 home = NULL;
2775         }
2776
2777         return 0;
2778 }
2779
2780 /*
2781  * Return values:
2782  * < 0 : wait_for_terminate() failed to get the state of the
2783  *       container, the container was terminated by a signal, or
2784  *       failed for an unknown reason.  No change is made to the
2785  *       container argument.
2786  * > 0 : The program executed in the container terminated with an
2787  *       error.  The exit code of the program executed in the
2788  *       container is returned.  The container argument has been set
2789  *       to CONTAINER_TERMINATED.
2790  *   0 : The container is being rebooted, has been shut down or exited
2791  *       successfully.  The container argument has been set to either
2792  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2793  *
2794  * That is, success is indicated by a return value of zero, and an
2795  * error is indicated by a non-zero value.
2796  */
2797 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2798         siginfo_t status;
2799         int r;
2800
2801         r = wait_for_terminate(pid, &status);
2802         if (r < 0)
2803                 return log_warning_errno(r, "Failed to wait for container: %m");
2804
2805         switch (status.si_code) {
2806
2807         case CLD_EXITED:
2808                 if (status.si_status == 0) {
2809                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2810
2811                 } else
2812                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2813
2814                 *container = CONTAINER_TERMINATED;
2815                 return status.si_status;
2816
2817         case CLD_KILLED:
2818                 if (status.si_status == SIGINT) {
2819
2820                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2821                         *container = CONTAINER_TERMINATED;
2822                         return 0;
2823
2824                 } else if (status.si_status == SIGHUP) {
2825
2826                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2827                         *container = CONTAINER_REBOOTED;
2828                         return 0;
2829                 }
2830
2831                 /* CLD_KILLED fallthrough */
2832
2833         case CLD_DUMPED:
2834                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2835                 return -EIO;
2836
2837         default:
2838                 log_error("Container %s failed due to unknown reason.", arg_machine);
2839                 return -EIO;
2840         }
2841
2842         return r;
2843 }
2844
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
2846
2847 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2848         pid_t pid;
2849
2850         pid = PTR_TO_UINT32(userdata);
2851         if (pid > 0) {
2852                 if (kill(pid, SIGRTMIN+3) >= 0) {
2853                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2854                         sd_event_source_set_userdata(s, NULL);
2855                         return 0;
2856                 }
2857         }
2858
2859         sd_event_exit(sd_event_source_get_event(s), 0);
2860         return 0;
2861 }
2862
2863 static int determine_names(void) {
2864
2865         if (!arg_image && !arg_directory) {
2866                 if (arg_machine)
2867                         arg_directory = strappend("/var/lib/container/", arg_machine);
2868                 else
2869                         arg_directory = get_current_dir_name();
2870
2871                 if (!arg_directory) {
2872                         log_error("Failed to determine path, please use -D.");
2873                         return -EINVAL;
2874                 }
2875         }
2876
2877         if (!arg_machine) {
2878                 arg_machine = strdup(basename(arg_image ?: arg_directory));
2879                 if (!arg_machine)
2880                         return log_oom();
2881
2882                 hostname_cleanup(arg_machine, false);
2883                 if (!machine_name_is_valid(arg_machine)) {
2884                         log_error("Failed to determine machine name automatically, please use -M.");
2885                         return -EINVAL;
2886                 }
2887         }
2888
2889         return 0;
2890 }
2891
2892 int main(int argc, char *argv[]) {
2893
2894         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2895         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2896         _cleanup_close_ int master = -1, image_fd = -1;
2897         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2898         _cleanup_fdset_free_ FDSet *fds = NULL;
2899         int r, n_fd_passed, loop_nr = -1;
2900         const char *console = NULL;
2901         char veth_name[IFNAMSIZ];
2902         bool secondary = false, remove_subvol = false;
2903         sigset_t mask, mask_chld;
2904         pid_t pid = 0;
2905         int ret = EXIT_SUCCESS;
2906
2907         log_parse_environment();
2908         log_open();
2909
2910         r = parse_argv(argc, argv);
2911         if (r <= 0)
2912                 goto finish;
2913
2914         r = determine_names();
2915         if (r < 0)
2916                 goto finish;
2917
2918         if (geteuid() != 0) {
2919                 log_error("Need to be root.");
2920                 r = -EPERM;
2921                 goto finish;
2922         }
2923
2924         if (sd_booted() <= 0) {
2925                 log_error("Not running on a systemd system.");
2926                 r = -EINVAL;
2927                 goto finish;
2928         }
2929
2930         log_close();
2931         n_fd_passed = sd_listen_fds(false);
2932         if (n_fd_passed > 0) {
2933                 r = fdset_new_listen_fds(&fds, false);
2934                 if (r < 0) {
2935                         log_error_errno(r, "Failed to collect file descriptors: %m");
2936                         goto finish;
2937                 }
2938         }
2939         fdset_close_others(fds);
2940         log_open();
2941
2942         if (arg_directory) {
2943                 assert(!arg_image);
2944
2945                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
2946                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
2947                         r = -EINVAL;
2948                         goto finish;
2949                 }
2950
2951                 if (arg_template) {
2952                         r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
2953                         if (r == -EEXIST) {
2954                                 if (!arg_quiet)
2955                                         log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
2956                         } else if (r < 0) {
2957                                 log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
2958                                 goto finish;
2959                         } else {
2960                                 if (!arg_quiet)
2961                                         log_info("Populated %s from template %s.", arg_directory, arg_template);
2962                         }
2963
2964                 } else if (arg_ephemeral) {
2965                         char *np;
2966
2967                         /* If the specified path is a mount point we
2968                          * generate the new snapshot immediately
2969                          * inside it under a random name. However if
2970                          * the specified is not a mount point we
2971                          * create the new snapshot in the parent
2972                          * directory, just next to it. */
2973                         r = path_is_mount_point(arg_directory, false);
2974                         if (r < 0) {
2975                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
2976                                 goto finish;
2977                         }
2978                         if (r > 0)
2979                                 r = tempfn_random_child(arg_directory, &np);
2980                         else
2981                                 r = tempfn_random(arg_directory, &np);
2982                         if (r < 0) {
2983                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
2984                                 goto finish;
2985                         }
2986
2987                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
2988                         if (r < 0) {
2989                                 free(np);
2990                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
2991                                 goto finish;
2992                         }
2993
2994                         free(arg_directory);
2995                         arg_directory = np;
2996
2997                         remove_subvol = true;
2998                 }
2999
3000                 if (arg_boot) {
3001                         if (path_is_os_tree(arg_directory) <= 0) {
3002                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3003                                 r = -EINVAL;
3004                                 goto finish;
3005                         }
3006                 } else {
3007                         const char *p;
3008
3009                         p = strappenda(arg_directory,
3010                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3011                         if (access(p, F_OK) < 0) {
3012                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3013                                 r = -EINVAL;
3014                                 goto finish;
3015                         }
3016                 }
3017
3018         } else {
3019                 char template[] = "/tmp/nspawn-root-XXXXXX";
3020
3021                 assert(arg_image);
3022                 assert(!arg_template);
3023
3024                 if (!mkdtemp(template)) {
3025                         log_error_errno(errno, "Failed to create temporary directory: %m");
3026                         r = -errno;
3027                         goto finish;
3028                 }
3029
3030                 arg_directory = strdup(template);
3031                 if (!arg_directory) {
3032                         r = log_oom();
3033                         goto finish;
3034                 }
3035
3036                 image_fd = setup_image(&device_path, &loop_nr);
3037                 if (image_fd < 0) {
3038                         r = image_fd;
3039                         goto finish;
3040                 }
3041
3042                 r = dissect_image(image_fd,
3043                                   &root_device, &root_device_rw,
3044                                   &home_device, &home_device_rw,
3045                                   &srv_device, &srv_device_rw,
3046                                   &secondary);
3047                 if (r < 0)
3048                         goto finish;
3049         }
3050
3051         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3052         if (master < 0) {
3053                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3054                 goto finish;
3055         }
3056
3057         console = ptsname(master);
3058         if (!console) {
3059                 r = log_error_errno(errno, "Failed to determine tty name: %m");
3060                 goto finish;
3061         }
3062
3063         if (!arg_quiet)
3064                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3065                          arg_machine, arg_image ?: arg_directory);
3066
3067         if (unlockpt(master) < 0) {
3068                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3069                 goto finish;
3070         }
3071
3072         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3073                 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3074                 goto finish;
3075         }
3076
3077         sd_notify(false,
3078                   "READY=1\n"
3079                   "STATUS=Container running.");
3080
3081         assert_se(sigemptyset(&mask) == 0);
3082         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3083         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3084
3085         assert_se(sigemptyset(&mask_chld) == 0);
3086         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3087
3088         for (;;) {
3089                 ContainerStatus container_status;
3090                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3091                 struct sigaction sa = {
3092                         .sa_handler = nop_handler,
3093                         .sa_flags = SA_NOCLDSTOP,
3094                 };
3095
3096                 r = barrier_create(&barrier);
3097                 if (r < 0) {
3098                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3099                         goto finish;
3100                 }
3101
3102                 /* Child can be killed before execv(), so handle SIGCHLD
3103                  * in order to interrupt parent's blocking calls and
3104                  * give it a chance to call wait() and terminate. */
3105                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3106                 if (r < 0) {
3107                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3108                         goto finish;
3109                 }
3110
3111                 r = sigaction(SIGCHLD, &sa, NULL);
3112                 if (r < 0) {
3113                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3114                         goto finish;
3115                 }
3116
3117                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3118                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3119                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
3120                 if (pid < 0) {
3121                         if (errno == EINVAL)
3122                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3123                         else
3124                                 r = log_error_errno(errno, "clone() failed: %m");
3125
3126                         goto finish;
3127                 }
3128
3129                 if (pid == 0) {
3130                         /* child */
3131                         _cleanup_free_ char *home = NULL;
3132                         unsigned n_env = 2;
3133                         const char *envp[] = {
3134                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3135                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3136                                 NULL, /* TERM */
3137                                 NULL, /* HOME */
3138                                 NULL, /* USER */
3139                                 NULL, /* LOGNAME */
3140                                 NULL, /* container_uuid */
3141                                 NULL, /* LISTEN_FDS */
3142                                 NULL, /* LISTEN_PID */
3143                                 NULL
3144                         };
3145                         char **env_use;
3146
3147                         barrier_set_role(&barrier, BARRIER_CHILD);
3148
3149                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3150                         if (envp[n_env])
3151                                 n_env ++;
3152
3153                         master = safe_close(master);
3154
3155                         close_nointr(STDIN_FILENO);
3156                         close_nointr(STDOUT_FILENO);
3157                         close_nointr(STDERR_FILENO);
3158
3159                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3160
3161                         reset_all_signal_handlers();
3162                         reset_signal_mask();
3163
3164                         r = open_terminal(console, O_RDWR);
3165                         if (r != STDIN_FILENO) {
3166                                 if (r >= 0) {
3167                                         safe_close(r);
3168                                         r = -EINVAL;
3169                                 }
3170
3171                                 log_error_errno(r, "Failed to open console: %m");
3172                                 _exit(EXIT_FAILURE);
3173                         }
3174
3175                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3176                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3177                                 log_error_errno(errno, "Failed to duplicate console: %m");
3178                                 _exit(EXIT_FAILURE);
3179                         }
3180
3181                         if (setsid() < 0) {
3182                                 log_error_errno(errno, "setsid() failed: %m");
3183                                 _exit(EXIT_FAILURE);
3184                         }
3185
3186                         if (reset_audit_loginuid() < 0)
3187                                 _exit(EXIT_FAILURE);
3188
3189                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3190                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3191                                 _exit(EXIT_FAILURE);
3192                         }
3193
3194                         /* Mark everything as slave, so that we still
3195                          * receive mounts from the real root, but don't
3196                          * propagate mounts to the real root. */
3197                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3198                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3199                                 _exit(EXIT_FAILURE);
3200                         }
3201
3202                         if (mount_devices(arg_directory,
3203                                           root_device, root_device_rw,
3204                                           home_device, home_device_rw,
3205                                           srv_device, srv_device_rw) < 0)
3206                                 _exit(EXIT_FAILURE);
3207
3208                         /* Turn directory into bind mount */
3209                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3210                                 log_error_errno(errno, "Failed to make bind mount: %m");
3211                                 _exit(EXIT_FAILURE);
3212                         }
3213
3214                         r = setup_volatile(arg_directory);
3215                         if (r < 0)
3216                                 _exit(EXIT_FAILURE);
3217
3218                         if (setup_volatile_state(arg_directory) < 0)
3219                                 _exit(EXIT_FAILURE);
3220
3221                         r = base_filesystem_create(arg_directory);
3222                         if (r < 0)
3223                                 _exit(EXIT_FAILURE);
3224
3225                         if (arg_read_only) {
3226                                 r = bind_remount_recursive(arg_directory, true);
3227                                 if (r < 0) {
3228                                         log_error_errno(r, "Failed to make tree read-only: %m");
3229                                         _exit(EXIT_FAILURE);
3230                                 }
3231                         }
3232
3233                         if (mount_all(arg_directory) < 0)
3234                                 _exit(EXIT_FAILURE);
3235
3236                         if (copy_devnodes(arg_directory) < 0)
3237                                 _exit(EXIT_FAILURE);
3238
3239                         if (setup_ptmx(arg_directory) < 0)
3240                                 _exit(EXIT_FAILURE);
3241
3242                         dev_setup(arg_directory);
3243
3244                         if (setup_seccomp() < 0)
3245                                 _exit(EXIT_FAILURE);
3246
3247                         if (setup_dev_console(arg_directory, console) < 0)
3248                                 _exit(EXIT_FAILURE);
3249
3250                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3251                                 _exit(EXIT_FAILURE);
3252
3253                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3254
3255                         if (setup_boot_id(arg_directory) < 0)
3256                                 _exit(EXIT_FAILURE);
3257
3258                         if (setup_timezone(arg_directory) < 0)
3259                                 _exit(EXIT_FAILURE);
3260
3261                         if (setup_resolv_conf(arg_directory) < 0)
3262                                 _exit(EXIT_FAILURE);
3263
3264                         if (setup_journal(arg_directory) < 0)
3265                                 _exit(EXIT_FAILURE);
3266
3267                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3268                                 _exit(EXIT_FAILURE);
3269
3270                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3271                                 _exit(EXIT_FAILURE);
3272
3273                         if (mount_tmpfs(arg_directory) < 0)
3274                                 _exit(EXIT_FAILURE);
3275
3276                         /* Tell the parent that we are ready, and that
3277                          * it can cgroupify us so that we lack access
3278                          * to certain devices and resources. */
3279                         (void)barrier_place(&barrier);
3280
3281                         if (chdir(arg_directory) < 0) {
3282                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3283                                 _exit(EXIT_FAILURE);
3284                         }
3285
3286                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3287                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3288                                 _exit(EXIT_FAILURE);
3289                         }
3290
3291                         if (chroot(".") < 0) {
3292                                 log_error_errno(errno, "chroot() failed: %m");
3293                                 _exit(EXIT_FAILURE);
3294                         }
3295
3296                         if (chdir("/") < 0) {
3297                                 log_error_errno(errno, "chdir() failed: %m");
3298                                 _exit(EXIT_FAILURE);
3299                         }
3300
3301                         umask(0022);
3302
3303                         if (arg_private_network)
3304                                 loopback_setup();
3305
3306                         if (drop_capabilities() < 0) {
3307                                 log_error_errno(errno, "drop_capabilities() failed: %m");
3308                                 _exit(EXIT_FAILURE);
3309                         }
3310
3311                         r = change_uid_gid(&home);
3312                         if (r < 0)
3313                                 _exit(EXIT_FAILURE);
3314
3315                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3316                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3317                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3318                                 log_oom();
3319                                 _exit(EXIT_FAILURE);
3320                         }
3321
3322                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3323                                 char as_uuid[37];
3324
3325                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3326                                         log_oom();
3327                                         _exit(EXIT_FAILURE);
3328                                 }
3329                         }
3330
3331                         if (fdset_size(fds) > 0) {
3332                                 r = fdset_cloexec(fds, false);
3333                                 if (r < 0) {
3334                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3335                                         _exit(EXIT_FAILURE);
3336                                 }
3337
3338                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3339                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3340                                         log_oom();
3341                                         _exit(EXIT_FAILURE);
3342                                 }
3343                         }
3344
3345                         setup_hostname();
3346
3347                         if (arg_personality != 0xffffffffLU) {
3348                                 if (personality(arg_personality) < 0) {
3349                                         log_error_errno(errno, "personality() failed: %m");
3350                                         _exit(EXIT_FAILURE);
3351                                 }
3352                         } else if (secondary) {
3353                                 if (personality(PER_LINUX32) < 0) {
3354                                         log_error_errno(errno, "personality() failed: %m");
3355                                         _exit(EXIT_FAILURE);
3356                                 }
3357                         }
3358
3359 #ifdef HAVE_SELINUX
3360                         if (arg_selinux_context)
3361                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3362                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3363                                         _exit(EXIT_FAILURE);
3364                                 }
3365 #endif
3366
3367                         if (!strv_isempty(arg_setenv)) {
3368                                 char **n;
3369
3370                                 n = strv_env_merge(2, envp, arg_setenv);
3371                                 if (!n) {
3372                                         log_oom();
3373                                         _exit(EXIT_FAILURE);
3374                                 }
3375
3376                                 env_use = n;
3377                         } else
3378                                 env_use = (char**) envp;
3379
3380                         /* Wait until the parent is ready with the setup, too... */
3381                         if (!barrier_place_and_sync(&barrier))
3382                                 _exit(EXIT_FAILURE);
3383
3384                         if (arg_boot) {
3385                                 char **a;
3386                                 size_t l;
3387
3388                                 /* Automatically search for the init system */
3389
3390                                 l = 1 + argc - optind;
3391                                 a = newa(char*, l + 1);
3392                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3393
3394                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3395                                 execve(a[0], a, env_use);
3396
3397                                 a[0] = (char*) "/lib/systemd/systemd";
3398                                 execve(a[0], a, env_use);
3399
3400                                 a[0] = (char*) "/sbin/init";
3401                                 execve(a[0], a, env_use);
3402                         } else if (argc > optind)
3403                                 execvpe(argv[optind], argv + optind, env_use);
3404                         else {
3405                                 chdir(home ? home : "/root");
3406                                 execle("/bin/bash", "-bash", NULL, env_use);
3407                                 execle("/bin/sh", "-sh", NULL, env_use);
3408                         }
3409
3410                         log_error_errno(errno, "execv() failed: %m");
3411                         _exit(EXIT_FAILURE);
3412                 }
3413
3414                 barrier_set_role(&barrier, BARRIER_PARENT);
3415                 fdset_free(fds);
3416                 fds = NULL;
3417
3418                 /* wait for child-setup to be done */
3419                 if (barrier_place_and_sync(&barrier)) {
3420                         _cleanup_event_unref_ sd_event *event = NULL;
3421                         _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3422                         int ifi = 0;
3423
3424                         r = move_network_interfaces(pid);
3425                         if (r < 0)
3426                                 goto finish;
3427
3428                         r = setup_veth(pid, veth_name, &ifi);
3429                         if (r < 0)
3430                                 goto finish;
3431
3432                         r = setup_bridge(veth_name, &ifi);
3433                         if (r < 0)
3434                                 goto finish;
3435
3436                         r = setup_macvlan(pid);
3437                         if (r < 0)
3438                                 goto finish;
3439
3440                         r = register_machine(pid, ifi);
3441                         if (r < 0)
3442                                 goto finish;
3443
3444                         /* Block SIGCHLD here, before notifying child.
3445                          * process_pty() will handle it with the other signals. */
3446                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3447                         if (r < 0)
3448                                 goto finish;
3449
3450                         /* Reset signal to default */
3451                         r = default_signals(SIGCHLD, -1);
3452                         if (r < 0)
3453                                 goto finish;
3454
3455                         /* Notify the child that the parent is ready with all
3456                          * its setup, and that the child can now hand over
3457                          * control to the code to run inside the container. */
3458                         (void)barrier_place(&barrier);
3459
3460                         r = sd_event_new(&event);
3461                         if (r < 0) {
3462                                 log_error_errno(r, "Failed to get default event source: %m");
3463                                 goto finish;
3464                         }
3465
3466                         if (arg_boot) {
3467                                 /* Try to kill the init system on SIGINT or SIGTERM */
3468                                 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3469                                 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3470                         } else {
3471                                 /* Immediately exit */
3472                                 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3473                                 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3474                         }
3475
3476                         /* simply exit on sigchld */
3477                         sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3478
3479                         r = pty_forward_new(event, master, &forward);
3480                         if (r < 0) {
3481                                 log_error_errno(r, "Failed to create PTY forwarder: %m");
3482                                 goto finish;
3483                         }
3484
3485                         r = sd_event_loop(event);
3486                         if (r < 0) {
3487                                 log_error_errno(r, "Failed to run event loop: %m");
3488                                 goto finish;
3489                         }
3490
3491                         forward = pty_forward_free(forward);
3492
3493                         if (!arg_quiet)
3494                                 putc('\n', stdout);
3495
3496                         /* Kill if it is not dead yet anyway */
3497                         terminate_machine(pid);
3498                 }
3499
3500                 /* Normally redundant, but better safe than sorry */
3501                 kill(pid, SIGKILL);
3502
3503                 r = wait_for_container(pid, &container_status);
3504                 pid = 0;
3505
3506                 if (r < 0)
3507                         /* We failed to wait for the container, or the
3508                          * container exited abnormally */
3509                         goto finish;
3510                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3511                         /* The container exited with a non-zero
3512                          * status, or with zero status and no reboot
3513                          * was requested. */
3514                         ret = r;
3515                         break;
3516                 }
3517
3518                 /* CONTAINER_REBOOTED, loop again */
3519
3520                 if (arg_keep_unit) {
3521                         /* Special handling if we are running as a
3522                          * service: instead of simply restarting the
3523                          * machine we want to restart the entire
3524                          * service, so let's inform systemd about this
3525                          * with the special exit code 133. The service
3526                          * file uses RestartForceExitStatus=133 so
3527                          * that this results in a full nspawn
3528                          * restart. This is necessary since we might
3529                          * have cgroup parameters set we want to have
3530                          * flushed out. */
3531                         ret = 133;
3532                         r = 0;
3533                         break;
3534                 }
3535         }
3536
3537 finish:
3538         sd_notify(false,
3539                   "STOPPING=1\n"
3540                   "STATUS=Terminating...");
3541
3542         loop_remove(loop_nr, &image_fd);
3543
3544         if (pid > 0)
3545                 kill(pid, SIGKILL);
3546
3547         if (remove_subvol && arg_directory) {
3548                 int k;
3549
3550                 k = btrfs_subvol_remove(arg_directory);
3551                 if (k < 0)
3552                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3553         }
3554
3555         free(arg_directory);
3556         free(arg_template);
3557         free(arg_image);
3558         free(arg_machine);
3559         free(arg_user);
3560         strv_free(arg_setenv);
3561         strv_free(arg_network_interfaces);
3562         strv_free(arg_network_macvlan);
3563         strv_free(arg_bind);
3564         strv_free(arg_bind_ro);
3565         strv_free(arg_tmpfs);
3566
3567         return r < 0 ? EXIT_FAILURE : ret;
3568 }