chiark / gitweb /
nspawn: properly unset arg_link_journal_try, when --link-journal= is specified
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92 #include "event-util.h"
93 #include "cap-list.h"
94 #include "btrfs-util.h"
95
96 #ifdef HAVE_SECCOMP
97 #include "seccomp-util.h"
98 #endif
99
/* How a container run came to an end. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
104
/* Modes selectable via --link-journal= (see parse_argv()); the
 * "try-" variants map to LINK_GUEST/LINK_HOST plus a separate flag. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
111
/* Values accepted by --volatile[=MODE]: a boolean or "state". */
typedef enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2
} Volatile;
117
118 static char *arg_directory = NULL;
119 static char *arg_template = NULL;
120 static char *arg_user = NULL;
121 static sd_id128_t arg_uuid = {};
122 static char *arg_machine = NULL;
123 static const char *arg_selinux_context = NULL;
124 static const char *arg_selinux_apifs_context = NULL;
125 static const char *arg_slice = NULL;
126 static bool arg_private_network = false;
127 static bool arg_read_only = false;
128 static bool arg_boot = false;
129 static bool arg_ephemeral = false;
130 static LinkJournal arg_link_journal = LINK_AUTO;
131 static bool arg_link_journal_try = false;
132 static uint64_t arg_retain =
133         (1ULL << CAP_CHOWN) |
134         (1ULL << CAP_DAC_OVERRIDE) |
135         (1ULL << CAP_DAC_READ_SEARCH) |
136         (1ULL << CAP_FOWNER) |
137         (1ULL << CAP_FSETID) |
138         (1ULL << CAP_IPC_OWNER) |
139         (1ULL << CAP_KILL) |
140         (1ULL << CAP_LEASE) |
141         (1ULL << CAP_LINUX_IMMUTABLE) |
142         (1ULL << CAP_NET_BIND_SERVICE) |
143         (1ULL << CAP_NET_BROADCAST) |
144         (1ULL << CAP_NET_RAW) |
145         (1ULL << CAP_SETGID) |
146         (1ULL << CAP_SETFCAP) |
147         (1ULL << CAP_SETPCAP) |
148         (1ULL << CAP_SETUID) |
149         (1ULL << CAP_SYS_ADMIN) |
150         (1ULL << CAP_SYS_CHROOT) |
151         (1ULL << CAP_SYS_NICE) |
152         (1ULL << CAP_SYS_PTRACE) |
153         (1ULL << CAP_SYS_TTY_CONFIG) |
154         (1ULL << CAP_SYS_RESOURCE) |
155         (1ULL << CAP_SYS_BOOT) |
156         (1ULL << CAP_AUDIT_WRITE) |
157         (1ULL << CAP_AUDIT_CONTROL) |
158         (1ULL << CAP_MKNOD);
159 static char **arg_bind = NULL;
160 static char **arg_bind_ro = NULL;
161 static char **arg_tmpfs = NULL;
162 static char **arg_setenv = NULL;
163 static bool arg_quiet = false;
164 static bool arg_share_system = false;
165 static bool arg_register = true;
166 static bool arg_keep_unit = false;
167 static char **arg_network_interfaces = NULL;
168 static char **arg_network_macvlan = NULL;
169 static bool arg_network_veth = false;
170 static const char *arg_network_bridge = NULL;
171 static unsigned long arg_personality = 0xffffffffLU;
172 static char *arg_image = NULL;
173 static Volatile arg_volatile = VOLATILE_NO;
174
175 static void help(void) {
176         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
177                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
178                "  -h --help                 Show this help\n"
179                "     --version              Print version string\n"
180                "  -q --quiet                Do not show status information\n"
181                "  -D --directory=PATH       Root directory for the container\n"
182                "     --template=PATH        Initialize root directory from template directory,\n"
183                "                            if missing\n"
184                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
185                "                            remove it after exit\n"
186                "  -i --image=PATH           File system device or disk image for the container\n"
187                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
188                "  -u --user=USER            Run the command under specified user or uid\n"
189                "  -M --machine=NAME         Set the machine name for the container\n"
190                "     --uuid=UUID            Set a specific machine UUID for the container\n"
191                "  -S --slice=SLICE          Place the container in the specified slice\n"
192                "     --private-network      Disable network in container\n"
193                "     --network-interface=INTERFACE\n"
194                "                            Assign an existing network interface to the\n"
195                "                            container\n"
196                "     --network-macvlan=INTERFACE\n"
197                "                            Create a macvlan network interface based on an\n"
198                "                            existing network interface to the container\n"
199                "     --network-veth         Add a virtual ethernet connection between host\n"
200                "                            and container\n"
201                "     --network-bridge=INTERFACE\n"
202                "                            Add a virtual ethernet connection between host\n"
203                "                            and container and add it to an existing bridge on\n"
204                "                            the host\n"
205                "  -Z --selinux-context=SECLABEL\n"
206                "                            Set the SELinux security context to be used by\n"
207                "                            processes in the container\n"
208                "  -L --selinux-apifs-context=SECLABEL\n"
209                "                            Set the SELinux security context to be used by\n"
210                "                            API/tmpfs file systems in the container\n"
211                "     --capability=CAP       In addition to the default, retain specified\n"
212                "                            capability\n"
213                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
214                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
215                "                            try-guest, try-host\n"
216                "  -j                        Equivalent to --link-journal=try-guest\n"
217                "     --read-only            Mount the root directory read-only\n"
218                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
219                "                            the container\n"
220                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
221                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
222                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
223                "     --share-system         Share system namespaces with host\n"
224                "     --register=BOOLEAN     Register container as machine\n"
225                "     --keep-unit            Do not register a scope for the machine, reuse\n"
226                "                            the service unit nspawn is running in\n"
227                "     --volatile[=MODE]      Run the system in volatile mode\n",
228                program_invocation_short_name);
229 }
230
/* Replace *b with a cleaned-up copy of path.
 *
 * The path is canonicalized when possible. If that fails with ENOENT
 * it is instead made absolute relative to the current working
 * directory. The previous value of *b is freed.
 *
 * Returns 0 on success, -ENOMEM if the absolute path could not be
 * built, or the negated errno left by canonicalize_file_name(). */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);

        /* Any failure other than ENOENT is reported to the caller. */
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);
        return 0;
}
251
252 static int parse_argv(int argc, char *argv[]) {
253
254         enum {
255                 ARG_VERSION = 0x100,
256                 ARG_PRIVATE_NETWORK,
257                 ARG_UUID,
258                 ARG_READ_ONLY,
259                 ARG_CAPABILITY,
260                 ARG_DROP_CAPABILITY,
261                 ARG_LINK_JOURNAL,
262                 ARG_BIND,
263                 ARG_BIND_RO,
264                 ARG_TMPFS,
265                 ARG_SETENV,
266                 ARG_SHARE_SYSTEM,
267                 ARG_REGISTER,
268                 ARG_KEEP_UNIT,
269                 ARG_NETWORK_INTERFACE,
270                 ARG_NETWORK_MACVLAN,
271                 ARG_NETWORK_VETH,
272                 ARG_NETWORK_BRIDGE,
273                 ARG_PERSONALITY,
274                 ARG_VOLATILE,
275                 ARG_TEMPLATE,
276         };
277
278         static const struct option options[] = {
279                 { "help",                  no_argument,       NULL, 'h'                   },
280                 { "version",               no_argument,       NULL, ARG_VERSION           },
281                 { "directory",             required_argument, NULL, 'D'                   },
282                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
283                 { "ephemeral",             no_argument,       NULL, 'x'                   },
284                 { "user",                  required_argument, NULL, 'u'                   },
285                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
286                 { "boot",                  no_argument,       NULL, 'b'                   },
287                 { "uuid",                  required_argument, NULL, ARG_UUID              },
288                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
289                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
290                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
291                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
292                 { "bind",                  required_argument, NULL, ARG_BIND              },
293                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
294                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
295                 { "machine",               required_argument, NULL, 'M'                   },
296                 { "slice",                 required_argument, NULL, 'S'                   },
297                 { "setenv",                required_argument, NULL, ARG_SETENV            },
298                 { "selinux-context",       required_argument, NULL, 'Z'                   },
299                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
300                 { "quiet",                 no_argument,       NULL, 'q'                   },
301                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
302                 { "register",              required_argument, NULL, ARG_REGISTER          },
303                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
304                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
305                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
306                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
307                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
308                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
309                 { "image",                 required_argument, NULL, 'i'                   },
310                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
311                 {}
312         };
313
314         int c, r;
315         uint64_t plus = 0, minus = 0;
316
317         assert(argc >= 0);
318         assert(argv);
319
320         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:x", options, NULL)) >= 0)
321
322                 switch (c) {
323
324                 case 'h':
325                         help();
326                         return 0;
327
328                 case ARG_VERSION:
329                         puts(PACKAGE_STRING);
330                         puts(SYSTEMD_FEATURES);
331                         return 0;
332
333                 case 'D':
334                         r = set_sanitized_path(&arg_directory, optarg);
335                         if (r < 0)
336                                 return log_error_errno(r, "Invalid root directory: %m");
337
338                         break;
339
340                 case ARG_TEMPLATE:
341                         r = set_sanitized_path(&arg_template, optarg);
342                         if (r < 0)
343                                 return log_error_errno(r, "Invalid template directory: %m");
344
345                         break;
346
347                 case 'i':
348                         r = set_sanitized_path(&arg_image, optarg);
349                         if (r < 0)
350                                 return log_error_errno(r, "Invalid image path: %m");
351
352                         break;
353
354                 case 'x':
355                         arg_ephemeral = true;
356                         break;
357
358                 case 'u':
359                         free(arg_user);
360                         arg_user = strdup(optarg);
361                         if (!arg_user)
362                                 return log_oom();
363
364                         break;
365
366                 case ARG_NETWORK_BRIDGE:
367                         arg_network_bridge = optarg;
368
369                         /* fall through */
370
371                 case ARG_NETWORK_VETH:
372                         arg_network_veth = true;
373                         arg_private_network = true;
374                         break;
375
376                 case ARG_NETWORK_INTERFACE:
377                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
378                                 return log_oom();
379
380                         arg_private_network = true;
381                         break;
382
383                 case ARG_NETWORK_MACVLAN:
384                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
385                                 return log_oom();
386
387                         /* fall through */
388
389                 case ARG_PRIVATE_NETWORK:
390                         arg_private_network = true;
391                         break;
392
393                 case 'b':
394                         arg_boot = true;
395                         break;
396
397                 case ARG_UUID:
398                         r = sd_id128_from_string(optarg, &arg_uuid);
399                         if (r < 0) {
400                                 log_error("Invalid UUID: %s", optarg);
401                                 return r;
402                         }
403                         break;
404
405                 case 'S':
406                         arg_slice = optarg;
407                         break;
408
409                 case 'M':
410                         if (isempty(optarg)) {
411                                 free(arg_machine);
412                                 arg_machine = NULL;
413                         } else {
414                                 if (!machine_name_is_valid(optarg)) {
415                                         log_error("Invalid machine name: %s", optarg);
416                                         return -EINVAL;
417                                 }
418
419                                 r = free_and_strdup(&arg_machine, optarg);
420                                 if (r < 0)
421                                         return log_oom();
422
423                                 break;
424                         }
425
426                 case 'Z':
427                         arg_selinux_context = optarg;
428                         break;
429
430                 case 'L':
431                         arg_selinux_apifs_context = optarg;
432                         break;
433
434                 case ARG_READ_ONLY:
435                         arg_read_only = true;
436                         break;
437
438                 case ARG_CAPABILITY:
439                 case ARG_DROP_CAPABILITY: {
440                         const char *state, *word;
441                         size_t length;
442
443                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
444                                 _cleanup_free_ char *t;
445
446                                 t = strndup(word, length);
447                                 if (!t)
448                                         return log_oom();
449
450                                 if (streq(t, "all")) {
451                                         if (c == ARG_CAPABILITY)
452                                                 plus = (uint64_t) -1;
453                                         else
454                                                 minus = (uint64_t) -1;
455                                 } else {
456                                         int cap;
457
458                                         cap = capability_from_name(t);
459                                         if (cap < 0) {
460                                                 log_error("Failed to parse capability %s.", t);
461                                                 return -EINVAL;
462                                         }
463
464                                         if (c == ARG_CAPABILITY)
465                                                 plus |= 1ULL << (uint64_t) cap;
466                                         else
467                                                 minus |= 1ULL << (uint64_t) cap;
468                                 }
469                         }
470
471                         break;
472                 }
473
474                 case 'j':
475                         arg_link_journal = LINK_GUEST;
476                         arg_link_journal_try = true;
477                         break;
478
479                 case ARG_LINK_JOURNAL:
480                         if (streq(optarg, "auto")) {
481                                 arg_link_journal = LINK_AUTO;
482                                 arg_link_journal_try = false;
483                         } else if (streq(optarg, "no")) {
484                                 arg_link_journal = LINK_NO;
485                                 arg_link_journal_try = false;
486                         } else if (streq(optarg, "guest")) {
487                                 arg_link_journal = LINK_GUEST;
488                                 arg_link_journal_try = false;
489                         } else if (streq(optarg, "host")) {
490                                 arg_link_journal = LINK_HOST;
491                                 arg_link_journal_try = false;
492                         } else if (streq(optarg, "try-guest")) {
493                                 arg_link_journal = LINK_GUEST;
494                                 arg_link_journal_try = true;
495                         } else if (streq(optarg, "try-host")) {
496                                 arg_link_journal = LINK_HOST;
497                                 arg_link_journal_try = true;
498                         } else {
499                                 log_error("Failed to parse link journal mode %s", optarg);
500                                 return -EINVAL;
501                         }
502
503                         break;
504
505                 case ARG_BIND:
506                 case ARG_BIND_RO: {
507                         _cleanup_free_ char *a = NULL, *b = NULL;
508                         char *e;
509                         char ***x;
510
511                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
512
513                         e = strchr(optarg, ':');
514                         if (e) {
515                                 a = strndup(optarg, e - optarg);
516                                 b = strdup(e + 1);
517                         } else {
518                                 a = strdup(optarg);
519                                 b = strdup(optarg);
520                         }
521
522                         if (!a || !b)
523                                 return log_oom();
524
525                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
526                                 log_error("Invalid bind mount specification: %s", optarg);
527                                 return -EINVAL;
528                         }
529
530                         r = strv_extend(x, a);
531                         if (r < 0)
532                                 return log_oom();
533
534                         r = strv_extend(x, b);
535                         if (r < 0)
536                                 return log_oom();
537
538                         break;
539                 }
540
541                 case ARG_TMPFS: {
542                         _cleanup_free_ char *a = NULL, *b = NULL;
543                         char *e;
544
545                         e = strchr(optarg, ':');
546                         if (e) {
547                                 a = strndup(optarg, e - optarg);
548                                 b = strdup(e + 1);
549                         } else {
550                                 a = strdup(optarg);
551                                 b = strdup("mode=0755");
552                         }
553
554                         if (!a || !b)
555                                 return log_oom();
556
557                         if (!path_is_absolute(a)) {
558                                 log_error("Invalid tmpfs specification: %s", optarg);
559                                 return -EINVAL;
560                         }
561
562                         r = strv_push(&arg_tmpfs, a);
563                         if (r < 0)
564                                 return log_oom();
565
566                         a = NULL;
567
568                         r = strv_push(&arg_tmpfs, b);
569                         if (r < 0)
570                                 return log_oom();
571
572                         b = NULL;
573
574                         break;
575                 }
576
577                 case ARG_SETENV: {
578                         char **n;
579
580                         if (!env_assignment_is_valid(optarg)) {
581                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
582                                 return -EINVAL;
583                         }
584
585                         n = strv_env_set(arg_setenv, optarg);
586                         if (!n)
587                                 return log_oom();
588
589                         strv_free(arg_setenv);
590                         arg_setenv = n;
591                         break;
592                 }
593
594                 case 'q':
595                         arg_quiet = true;
596                         break;
597
598                 case ARG_SHARE_SYSTEM:
599                         arg_share_system = true;
600                         break;
601
602                 case ARG_REGISTER:
603                         r = parse_boolean(optarg);
604                         if (r < 0) {
605                                 log_error("Failed to parse --register= argument: %s", optarg);
606                                 return r;
607                         }
608
609                         arg_register = r;
610                         break;
611
612                 case ARG_KEEP_UNIT:
613                         arg_keep_unit = true;
614                         break;
615
616                 case ARG_PERSONALITY:
617
618                         arg_personality = personality_from_string(optarg);
619                         if (arg_personality == 0xffffffffLU) {
620                                 log_error("Unknown or unsupported personality '%s'.", optarg);
621                                 return -EINVAL;
622                         }
623
624                         break;
625
626                 case ARG_VOLATILE:
627
628                         if (!optarg)
629                                 arg_volatile = VOLATILE_YES;
630                         else {
631                                 r = parse_boolean(optarg);
632                                 if (r < 0) {
633                                         if (streq(optarg, "state"))
634                                                 arg_volatile = VOLATILE_STATE;
635                                         else {
636                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
637                                                 return r;
638                                         }
639                                 } else
640                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
641                         }
642
643                         break;
644
645                 case '?':
646                         return -EINVAL;
647
648                 default:
649                         assert_not_reached("Unhandled option");
650                 }
651
652         if (arg_share_system)
653                 arg_register = false;
654
655         if (arg_boot && arg_share_system) {
656                 log_error("--boot and --share-system may not be combined.");
657                 return -EINVAL;
658         }
659
660         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
661                 log_error("--keep-unit may not be used when invoked from a user session.");
662                 return -EINVAL;
663         }
664
665         if (arg_directory && arg_image) {
666                 log_error("--directory= and --image= may not be combined.");
667                 return -EINVAL;
668         }
669
670         if (arg_template && arg_image) {
671                 log_error("--template= and --image= may not be combined.");
672                 return -EINVAL;
673         }
674
675         if (arg_template && !(arg_directory || arg_machine)) {
676                 log_error("--template= needs --directory= or --machine=.");
677                 return -EINVAL;
678         }
679
680         if (arg_ephemeral && arg_template) {
681                 log_error("--ephemeral and --template= may not be combined.");
682                 return -EINVAL;
683         }
684
685         if (arg_ephemeral && arg_image) {
686                 log_error("--ephemeral and --image= may not be combined.");
687                 return -EINVAL;
688         }
689
690         if (arg_volatile != VOLATILE_NO && arg_read_only) {
691                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
692                 return -EINVAL;
693         }
694
695         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
696
697         return 1;
698 }
699
700 static int mount_all(const char *dest) {
701
702         typedef struct MountPoint {
703                 const char *what;
704                 const char *where;
705                 const char *type;
706                 const char *options;
707                 unsigned long flags;
708                 bool fatal;
709         } MountPoint;
710
711         static const MountPoint mount_table[] = {
712                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
713                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
714                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
715                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
716                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
717                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
718                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
719                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
720 #ifdef HAVE_SELINUX
721                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
722                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
723 #endif
724         };
725
726         unsigned k;
727         int r = 0;
728
729         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
730                 _cleanup_free_ char *where = NULL;
731 #ifdef HAVE_SELINUX
732                 _cleanup_free_ char *options = NULL;
733 #endif
734                 const char *o;
735                 int t;
736
737                 where = strjoin(dest, "/", mount_table[k].where, NULL);
738                 if (!where)
739                         return log_oom();
740
741                 t = path_is_mount_point(where, true);
742                 if (t < 0) {
743                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
744
745                         if (r == 0)
746                                 r = t;
747
748                         continue;
749                 }
750
751                 /* Skip this entry if it is not a remount. */
752                 if (mount_table[k].what && t > 0)
753                         continue;
754
755                 t = mkdir_p(where, 0755);
756                 if (t < 0) {
757                         if (mount_table[k].fatal) {
758                                log_error_errno(t, "Failed to create directory %s: %m", where);
759
760                                 if (r == 0)
761                                         r = t;
762                         } else
763                                log_warning_errno(t, "Failed to create directory %s: %m", where);
764
765                         continue;
766                 }
767
768 #ifdef HAVE_SELINUX
769                 if (arg_selinux_apifs_context &&
770                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
771                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
772                         if (!options)
773                                 return log_oom();
774
775                         o = options;
776                 } else
777 #endif
778                         o = mount_table[k].options;
779
780
781                 if (mount(mount_table[k].what,
782                           where,
783                           mount_table[k].type,
784                           mount_table[k].flags,
785                           o) < 0) {
786
787                         if (mount_table[k].fatal) {
788                                 log_error_errno(errno, "mount(%s) failed: %m", where);
789
790                                 if (r == 0)
791                                         r = -errno;
792                         } else
793                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
794                 }
795         }
796
797         return r;
798 }
799
800 static int mount_binds(const char *dest, char **l, bool ro) {
801         char **x, **y;
802
803         STRV_FOREACH_PAIR(x, y, l) {
804                 _cleanup_free_ char *where = NULL;
805                 struct stat source_st, dest_st;
806                 int r;
807
808                 if (stat(*x, &source_st) < 0)
809                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
810
811                 where = strappend(dest, *y);
812                 if (!where)
813                         return log_oom();
814
815                 r = stat(where, &dest_st);
816                 if (r == 0) {
817                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
818                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
819                                 return -EINVAL;
820                         }
821                 } else if (errno == ENOENT) {
822                         r = mkdir_parents_label(where, 0755);
823                         if (r < 0)
824                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
825                 } else {
826                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
827                         return -errno;
828                 }
829
830                 /* Create the mount point, but be conservative -- refuse to create block
831                  * and char devices. */
832                 if (S_ISDIR(source_st.st_mode)) {
833                         r = mkdir_label(where, 0755);
834                         if (r < 0 && errno != EEXIST)
835                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
836                 } else if (S_ISFIFO(source_st.st_mode)) {
837                         r = mkfifo(where, 0644);
838                         if (r < 0 && errno != EEXIST)
839                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
840                 } else if (S_ISSOCK(source_st.st_mode)) {
841                         r = mknod(where, 0644 | S_IFSOCK, 0);
842                         if (r < 0 && errno != EEXIST)
843                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
844                 } else if (S_ISREG(source_st.st_mode)) {
845                         r = touch(where);
846                         if (r < 0)
847                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
848                 } else {
849                         log_error("Refusing to create mountpoint for file: %s", *x);
850                         return -ENOTSUP;
851                 }
852
853                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
854                         return log_error_errno(errno, "mount(%s) failed: %m", where);
855
856                 if (ro) {
857                         r = bind_remount_recursive(where, true);
858                         if (r < 0)
859                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
860                 }
861         }
862
863         return 0;
864 }
865
866 static int mount_tmpfs(const char *dest) {
867         char **i, **o;
868
869         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
870                 _cleanup_free_ char *where = NULL;
871                 int r;
872
873                 where = strappend(dest, *i);
874                 if (!where)
875                         return log_oom();
876
877                 r = mkdir_label(where, 0755);
878                 if (r < 0 && r != -EEXIST)
879                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
880
881                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
882                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
883         }
884
885         return 0;
886 }
887
888 static int setup_timezone(const char *dest) {
889         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
890         char *z, *y;
891         int r;
892
893         assert(dest);
894
895         /* Fix the timezone, if possible */
896         r = readlink_malloc("/etc/localtime", &p);
897         if (r < 0) {
898                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
899                 return 0;
900         }
901
902         z = path_startswith(p, "../usr/share/zoneinfo/");
903         if (!z)
904                 z = path_startswith(p, "/usr/share/zoneinfo/");
905         if (!z) {
906                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
907                 return 0;
908         }
909
910         where = strappend(dest, "/etc/localtime");
911         if (!where)
912                 return log_oom();
913
914         r = readlink_malloc(where, &q);
915         if (r >= 0) {
916                 y = path_startswith(q, "../usr/share/zoneinfo/");
917                 if (!y)
918                         y = path_startswith(q, "/usr/share/zoneinfo/");
919
920                 /* Already pointing to the right place? Then do nothing .. */
921                 if (y && streq(y, z))
922                         return 0;
923         }
924
925         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
926         if (!check)
927                 return log_oom();
928
929         if (access(check, F_OK) < 0) {
930                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
931                 return 0;
932         }
933
934         what = strappend("../usr/share/zoneinfo/", z);
935         if (!what)
936                 return log_oom();
937
938         r = mkdir_parents(where, 0755);
939         if (r < 0) {
940                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
941
942                 return 0;
943         }
944
945         r = unlink(where);
946         if (r < 0 && errno != ENOENT) {
947                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
948
949                 return 0;
950         }
951
952         if (symlink(what, where) < 0) {
953                 log_error_errno(errno, "Failed to correct timezone of container: %m");
954                 return 0;
955         }
956
957         return 0;
958 }
959
960 static int setup_resolv_conf(const char *dest) {
961         _cleanup_free_ char *where = NULL;
962         int r;
963
964         assert(dest);
965
966         if (arg_private_network)
967                 return 0;
968
969         /* Fix resolv.conf, if possible */
970         where = strappend(dest, "/etc/resolv.conf");
971         if (!where)
972                 return log_oom();
973
974         /* We don't really care for the results of this really. If it
975          * fails, it fails, but meh... */
976         r = mkdir_parents(where, 0755);
977         if (r < 0) {
978                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
979
980                 return 0;
981         }
982
983         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
984         if (r < 0) {
985                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
986
987                 return 0;
988         }
989
990         return 0;
991 }
992
993 static int setup_volatile_state(const char *directory) {
994         const char *p;
995         int r;
996
997         assert(directory);
998
999         if (arg_volatile != VOLATILE_STATE)
1000                 return 0;
1001
1002         /* --volatile=state means we simply overmount /var
1003            with a tmpfs, and the rest read-only. */
1004
1005         r = bind_remount_recursive(directory, true);
1006         if (r < 0)
1007                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1008
1009         p = strappenda(directory, "/var");
1010         r = mkdir(p, 0755);
1011         if (r < 0 && errno != EEXIST)
1012                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1013
1014         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1015                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1016
1017         return 0;
1018 }
1019
1020 static int setup_volatile(const char *directory) {
1021         bool tmpfs_mounted = false, bind_mounted = false;
1022         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1023         const char *f, *t;
1024         int r;
1025
1026         assert(directory);
1027
1028         if (arg_volatile != VOLATILE_YES)
1029                 return 0;
1030
1031         /* --volatile=yes means we mount a tmpfs to the root dir, and
1032            the original /usr to use inside it, and that read-only. */
1033
1034         if (!mkdtemp(template))
1035                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1036
1037         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1038                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1039                 r = -errno;
1040                 goto fail;
1041         }
1042
1043         tmpfs_mounted = true;
1044
1045         f = strappenda(directory, "/usr");
1046         t = strappenda(template, "/usr");
1047
1048         r = mkdir(t, 0755);
1049         if (r < 0 && errno != EEXIST) {
1050                 log_error_errno(errno, "Failed to create %s: %m", t);
1051                 r = -errno;
1052                 goto fail;
1053         }
1054
1055         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1056                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1057                 r = -errno;
1058                 goto fail;
1059         }
1060
1061         bind_mounted = true;
1062
1063         r = bind_remount_recursive(t, true);
1064         if (r < 0) {
1065                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1066                 goto fail;
1067         }
1068
1069         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1070                 log_error_errno(errno, "Failed to move root mount: %m");
1071                 r = -errno;
1072                 goto fail;
1073         }
1074
1075         rmdir(template);
1076
1077         return 0;
1078
1079 fail:
1080         if (bind_mounted)
1081                 umount(t);
1082         if (tmpfs_mounted)
1083                 umount(template);
1084         rmdir(template);
1085         return r;
1086 }
1087
1088 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1089
1090         snprintf(s, 37,
1091                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1092                  SD_ID128_FORMAT_VAL(id));
1093
1094         return s;
1095 }
1096
1097 static int setup_boot_id(const char *dest) {
1098         _cleanup_free_ char *from = NULL, *to = NULL;
1099         sd_id128_t rnd = {};
1100         char as_uuid[37];
1101         int r;
1102
1103         assert(dest);
1104
1105         if (arg_share_system)
1106                 return 0;
1107
1108         /* Generate a new randomized boot ID, so that each boot-up of
1109          * the container gets a new one */
1110
1111         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1112         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1113         if (!from || !to)
1114                 return log_oom();
1115
1116         r = sd_id128_randomize(&rnd);
1117         if (r < 0)
1118                 return log_error_errno(r, "Failed to generate random boot id: %m");
1119
1120         id128_format_as_uuid(rnd, as_uuid);
1121
1122         r = write_string_file(from, as_uuid);
1123         if (r < 0)
1124                 return log_error_errno(r, "Failed to write boot id: %m");
1125
1126         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1127                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1128                 r = -errno;
1129         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1130                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1131
1132         unlink(from);
1133         return r;
1134 }
1135
1136 static int copy_devnodes(const char *dest) {
1137
1138         static const char devnodes[] =
1139                 "null\0"
1140                 "zero\0"
1141                 "full\0"
1142                 "random\0"
1143                 "urandom\0"
1144                 "tty\0"
1145                 "net/tun\0";
1146
1147         const char *d;
1148         int r = 0;
1149         _cleanup_umask_ mode_t u;
1150
1151         assert(dest);
1152
1153         u = umask(0000);
1154
1155         NULSTR_FOREACH(d, devnodes) {
1156                 _cleanup_free_ char *from = NULL, *to = NULL;
1157                 struct stat st;
1158
1159                 from = strappend("/dev/", d);
1160                 to = strjoin(dest, "/dev/", d, NULL);
1161                 if (!from || !to)
1162                         return log_oom();
1163
1164                 if (stat(from, &st) < 0) {
1165
1166                         if (errno != ENOENT)
1167                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1168
1169                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1170
1171                         log_error("%s is not a char or block device, cannot copy", from);
1172                         return -EIO;
1173
1174                 } else {
1175                         r = mkdir_parents(to, 0775);
1176                         if (r < 0) {
1177                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1178                                 return -r;
1179                         }
1180
1181                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1182                                 return log_error_errno(errno, "mknod(%s) failed: %m", dest);
1183                 }
1184         }
1185
1186         return r;
1187 }
1188
1189 static int setup_ptmx(const char *dest) {
1190         _cleanup_free_ char *p = NULL;
1191
1192         p = strappend(dest, "/dev/ptmx");
1193         if (!p)
1194                 return log_oom();
1195
1196         if (symlink("pts/ptmx", p) < 0)
1197                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1198
1199         return 0;
1200 }
1201
1202 static int setup_dev_console(const char *dest, const char *console) {
1203         _cleanup_umask_ mode_t u;
1204         const char *to;
1205         struct stat st;
1206         int r;
1207
1208         assert(dest);
1209         assert(console);
1210
1211         u = umask(0000);
1212
1213         if (stat("/dev/null", &st) < 0)
1214                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1215
1216         r = chmod_and_chown(console, 0600, 0, 0);
1217         if (r < 0)
1218                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1219
1220         /* We need to bind mount the right tty to /dev/console since
1221          * ptys can only exist on pts file systems. To have something
1222          * to bind mount things on we create a device node first, and
1223          * use /dev/null for that since we the cgroups device policy
1224          * allows us to create that freely, while we cannot create
1225          * /dev/console. (Note that the major minor doesn't actually
1226          * matter here, since we mount it over anyway). */
1227
1228         to = strappenda(dest, "/dev/console");
1229         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1230                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1231
1232         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1233                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1234
1235         return 0;
1236 }
1237
1238 static int setup_kmsg(const char *dest, int kmsg_socket) {
1239         _cleanup_free_ char *from = NULL, *to = NULL;
1240         int r, fd, k;
1241         _cleanup_umask_ mode_t u;
1242         union {
1243                 struct cmsghdr cmsghdr;
1244                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1245         } control = {};
1246         struct msghdr mh = {
1247                 .msg_control = &control,
1248                 .msg_controllen = sizeof(control),
1249         };
1250         struct cmsghdr *cmsg;
1251
1252         assert(dest);
1253         assert(kmsg_socket >= 0);
1254
1255         u = umask(0000);
1256
1257         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1258          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1259          * on the reading side behave very similar to /proc/kmsg,
1260          * their writing side behaves differently from /dev/kmsg in
1261          * that writing blocks when nothing is reading. In order to
1262          * avoid any problems with containers deadlocking due to this
1263          * we simply make /dev/kmsg unavailable to the container. */
1264         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1265             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1266                 return log_oom();
1267
1268         if (mkfifo(from, 0600) < 0)
1269                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1270
1271         r = chmod_and_chown(from, 0600, 0, 0);
1272         if (r < 0)
1273                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1274
1275         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1276                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1277
1278         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1279         if (fd < 0)
1280                 return log_error_errno(errno, "Failed to open fifo: %m");
1281
1282         cmsg = CMSG_FIRSTHDR(&mh);
1283         cmsg->cmsg_level = SOL_SOCKET;
1284         cmsg->cmsg_type = SCM_RIGHTS;
1285         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1286         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1287
1288         mh.msg_controllen = cmsg->cmsg_len;
1289
1290         /* Store away the fd in the socket, so that it stays open as
1291          * long as we run the child */
1292         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1293         safe_close(fd);
1294
1295         if (k < 0)
1296                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1297
1298         /* And now make the FIFO unavailable as /dev/kmsg... */
1299         unlink(from);
1300         return 0;
1301 }
1302
1303 static int setup_hostname(void) {
1304
1305         if (arg_share_system)
1306                 return 0;
1307
1308         if (sethostname_idempotent(arg_machine) < 0)
1309                 return -errno;
1310
1311         return 0;
1312 }
1313
1314 static int setup_journal(const char *directory) {
1315         sd_id128_t machine_id, this_id;
1316         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1317         char *id;
1318         int r;
1319
1320         p = strappend(directory, "/etc/machine-id");
1321         if (!p)
1322                 return log_oom();
1323
1324         r = read_one_line_file(p, &b);
1325         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1326                 return 0;
1327         else if (r < 0)
1328                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1329
1330         id = strstrip(b);
1331         if (isempty(id) && arg_link_journal == LINK_AUTO)
1332                 return 0;
1333
1334         /* Verify validity */
1335         r = sd_id128_from_string(id, &machine_id);
1336         if (r < 0)
1337                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1338
1339         r = sd_id128_get_machine(&this_id);
1340         if (r < 0)
1341                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1342
1343         if (sd_id128_equal(machine_id, this_id)) {
1344                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1345                          "Host and machine ids are equal (%s): refusing to link journals", id);
1346                 if (arg_link_journal == LINK_AUTO)
1347                         return 0;
1348                 return
1349                         -EEXIST;
1350         }
1351
1352         if (arg_link_journal == LINK_NO)
1353                 return 0;
1354
1355         free(p);
1356         p = strappend("/var/log/journal/", id);
1357         q = strjoin(directory, "/var/log/journal/", id, NULL);
1358         if (!p || !q)
1359                 return log_oom();
1360
1361         if (path_is_mount_point(p, false) > 0) {
1362                 if (arg_link_journal != LINK_AUTO) {
1363                         log_error("%s: already a mount point, refusing to use for journal", p);
1364                         return -EEXIST;
1365                 }
1366
1367                 return 0;
1368         }
1369
1370         if (path_is_mount_point(q, false) > 0) {
1371                 if (arg_link_journal != LINK_AUTO) {
1372                         log_error("%s: already a mount point, refusing to use for journal", q);
1373                         return -EEXIST;
1374                 }
1375
1376                 return 0;
1377         }
1378
1379         r = readlink_and_make_absolute(p, &d);
1380         if (r >= 0) {
1381                 if ((arg_link_journal == LINK_GUEST ||
1382                      arg_link_journal == LINK_AUTO) &&
1383                     path_equal(d, q)) {
1384
1385                         r = mkdir_p(q, 0755);
1386                         if (r < 0)
1387                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1388                         return 0;
1389                 }
1390
1391                 if (unlink(p) < 0)
1392                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1393         } else if (r == -EINVAL) {
1394
1395                 if (arg_link_journal == LINK_GUEST &&
1396                     rmdir(p) < 0) {
1397
1398                         if (errno == ENOTDIR) {
1399                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1400                                 return r;
1401                         } else {
1402                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1403                                 return -errno;
1404                         }
1405                 }
1406         } else if (r != -ENOENT) {
1407                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1408                 return r;
1409         }
1410
1411         if (arg_link_journal == LINK_GUEST) {
1412
1413                 if (symlink(q, p) < 0) {
1414                         if (arg_link_journal_try) {
1415                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1416                                 return 0;
1417                         } else {
1418                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1419                                 return -errno;
1420                         }
1421                 }
1422
1423                 r = mkdir_p(q, 0755);
1424                 if (r < 0)
1425                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1426                 return 0;
1427         }
1428
1429         if (arg_link_journal == LINK_HOST) {
1430                 /* don't create parents here -- if the host doesn't have
1431                  * permanent journal set up, don't force it here */
1432                 r = mkdir(p, 0755);
1433                 if (r < 0) {
1434                         if (arg_link_journal_try) {
1435                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1436                                 return 0;
1437                         } else {
1438                                 log_error_errno(errno, "Failed to create %s: %m", p);
1439                                 return r;
1440                         }
1441                 }
1442
1443         } else if (access(p, F_OK) < 0)
1444                 return 0;
1445
1446         if (dir_is_empty(q) == 0)
1447                 log_warning("%s is not empty, proceeding anyway.", q);
1448
1449         r = mkdir_p(q, 0755);
1450         if (r < 0) {
1451                 log_error_errno(errno, "Failed to create %s: %m", q);
1452                 return r;
1453         }
1454
1455         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1456                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1457
1458         return 0;
1459 }
1460
1461 static int drop_capabilities(void) {
1462         return capability_bounding_set_drop(~arg_retain, false);
1463 }
1464
1465 static int register_machine(pid_t pid, int local_ifindex) {
1466         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1467         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1468         int r;
1469
1470         if (!arg_register)
1471                 return 0;
1472
1473         r = sd_bus_default_system(&bus);
1474         if (r < 0)
1475                 return log_error_errno(r, "Failed to open system bus: %m");
1476
1477         if (arg_keep_unit) {
1478                 r = sd_bus_call_method(
1479                                 bus,
1480                                 "org.freedesktop.machine1",
1481                                 "/org/freedesktop/machine1",
1482                                 "org.freedesktop.machine1.Manager",
1483                                 "RegisterMachineWithNetwork",
1484                                 &error,
1485                                 NULL,
1486                                 "sayssusai",
1487                                 arg_machine,
1488                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1489                                 "nspawn",
1490                                 "container",
1491                                 (uint32_t) pid,
1492                                 strempty(arg_directory),
1493                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1494         } else {
1495                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1496
1497                 r = sd_bus_message_new_method_call(
1498                                 bus,
1499                                 &m,
1500                                 "org.freedesktop.machine1",
1501                                 "/org/freedesktop/machine1",
1502                                 "org.freedesktop.machine1.Manager",
1503                                 "CreateMachineWithNetwork");
1504                 if (r < 0)
1505                         return log_error_errno(r, "Failed to create message: %m");
1506
1507                 r = sd_bus_message_append(
1508                                 m,
1509                                 "sayssusai",
1510                                 arg_machine,
1511                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1512                                 "nspawn",
1513                                 "container",
1514                                 (uint32_t) pid,
1515                                 strempty(arg_directory),
1516                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1517                 if (r < 0)
1518                         return log_error_errno(r, "Failed to append message arguments: %m");
1519
1520                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1521                 if (r < 0)
1522                         return log_error_errno(r, "Failed to open container: %m");
1523
1524                 if (!isempty(arg_slice)) {
1525                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1526                         if (r < 0)
1527                                 return log_error_errno(r, "Failed to append slice: %m");
1528                 }
1529
1530                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1531                 if (r < 0)
1532                         return log_error_errno(r, "Failed to add device policy: %m");
1533
1534                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1535                                           /* Allow the container to
1536                                            * access and create the API
1537                                            * device nodes, so that
1538                                            * PrivateDevices= in the
1539                                            * container can work
1540                                            * fine */
1541                                           "/dev/null", "rwm",
1542                                           "/dev/zero", "rwm",
1543                                           "/dev/full", "rwm",
1544                                           "/dev/random", "rwm",
1545                                           "/dev/urandom", "rwm",
1546                                           "/dev/tty", "rwm",
1547                                           "/dev/net/tun", "rwm",
1548                                           /* Allow the container
1549                                            * access to ptys. However,
1550                                            * do not permit the
1551                                            * container to ever create
1552                                            * these device nodes. */
1553                                           "/dev/pts/ptmx", "rw",
1554                                           "char-pts", "rw");
1555                 if (r < 0)
1556                         return log_error_errno(r, "Failed to add device whitelist: %m");
1557
1558                 r = sd_bus_message_close_container(m);
1559                 if (r < 0)
1560                         return log_error_errno(r, "Failed to close container: %m");
1561
1562                 r = sd_bus_call(bus, m, 0, &error, NULL);
1563         }
1564
1565         if (r < 0) {
1566                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1567                 return r;
1568         }
1569
1570         return 0;
1571 }
1572
1573 static int terminate_machine(pid_t pid) {
1574         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1575         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1576         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1577         const char *path;
1578         int r;
1579
1580         if (!arg_register)
1581                 return 0;
1582
1583         r = sd_bus_default_system(&bus);
1584         if (r < 0)
1585                 return log_error_errno(r, "Failed to open system bus: %m");
1586
1587         r = sd_bus_call_method(
1588                         bus,
1589                         "org.freedesktop.machine1",
1590                         "/org/freedesktop/machine1",
1591                         "org.freedesktop.machine1.Manager",
1592                         "GetMachineByPID",
1593                         &error,
1594                         &reply,
1595                         "u",
1596                         (uint32_t) pid);
1597         if (r < 0) {
1598                 /* Note that the machine might already have been
1599                  * cleaned up automatically, hence don't consider it a
1600                  * failure if we cannot get the machine object. */
1601                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1602                 return 0;
1603         }
1604
1605         r = sd_bus_message_read(reply, "o", &path);
1606         if (r < 0)
1607                 return bus_log_parse_error(r);
1608
1609         r = sd_bus_call_method(
1610                         bus,
1611                         "org.freedesktop.machine1",
1612                         path,
1613                         "org.freedesktop.machine1.Machine",
1614                         "Terminate",
1615                         &error,
1616                         NULL,
1617                         NULL);
1618         if (r < 0) {
1619                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1620                 return 0;
1621         }
1622
1623         return 0;
1624 }
1625
1626 static int reset_audit_loginuid(void) {
1627         _cleanup_free_ char *p = NULL;
1628         int r;
1629
1630         if (arg_share_system)
1631                 return 0;
1632
1633         r = read_one_line_file("/proc/self/loginuid", &p);
1634         if (r == -ENOENT)
1635                 return 0;
1636         if (r < 0)
1637                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1638
1639         /* Already reset? */
1640         if (streq(p, "4294967295"))
1641                 return 0;
1642
1643         r = write_string_file("/proc/self/loginuid", "4294967295");
1644         if (r < 0) {
1645                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1646                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1647                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1648                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1649                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1650
1651                 sleep(5);
1652         }
1653
1654         return 0;
1655 }
1656
/* Fixed keys handed to generate_mac() as hash_key. There is one per
 * kind of interface (host side of the veth pair, container side of the
 * veth pair, macvlan interfaces), so each kind is hashed with its own
 * key. */
#define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
#define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
#define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
1660
1661 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
1662         uint8_t result[8];
1663         size_t l, sz;
1664         uint8_t *v, *i;
1665         int r;
1666
1667         l = strlen(arg_machine);
1668         sz = sizeof(sd_id128_t) + l;
1669         if (idx > 0)
1670                 sz += sizeof(idx);
1671
1672         v = alloca(sz);
1673
1674         /* fetch some persistent data unique to the host */
1675         r = sd_id128_get_machine((sd_id128_t*) v);
1676         if (r < 0)
1677                 return r;
1678
1679         /* combine with some data unique (on this host) to this
1680          * container instance */
1681         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
1682         if (idx > 0) {
1683                 idx = htole64(idx);
1684                 memcpy(i, &idx, sizeof(idx));
1685         }
1686
1687         /* Let's hash the host machine ID plus the container name. We
1688          * use a fixed, but originally randomly created hash key here. */
1689         siphash24(result, v, sz, hash_key.bytes);
1690
1691         assert_cc(ETH_ALEN <= sizeof(result));
1692         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1693
1694         /* see eth_random_addr in the kernel */
1695         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1696         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1697
1698         return 0;
1699 }
1700
1701 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1702         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1703         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1704         struct ether_addr mac_host, mac_container;
1705         int r, i;
1706
1707         if (!arg_private_network)
1708                 return 0;
1709
1710         if (!arg_network_veth)
1711                 return 0;
1712
1713         /* Use two different interface name prefixes depending whether
1714          * we are in bridge mode or not. */
1715         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1716                  arg_network_bridge ? "vb" : "ve", arg_machine);
1717
1718         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1719         if (r < 0)
1720                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
1721
1722         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1723         if (r < 0)
1724                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
1725
1726         r = sd_rtnl_open(&rtnl, 0);
1727         if (r < 0)
1728                 return log_error_errno(r, "Failed to connect to netlink: %m");
1729
1730         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1731         if (r < 0)
1732                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1733
1734         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1735         if (r < 0)
1736                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1737
1738         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1739         if (r < 0)
1740                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1741
1742         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1743         if (r < 0)
1744                 return log_error_errno(r, "Failed to open netlink container: %m");
1745
1746         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1747         if (r < 0)
1748                 return log_error_errno(r, "Failed to open netlink container: %m");
1749
1750         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1751         if (r < 0)
1752                 return log_error_errno(r, "Failed to open netlink container: %m");
1753
1754         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1755         if (r < 0)
1756                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1757
1758         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1759         if (r < 0)
1760                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1761
1762         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1763         if (r < 0)
1764                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1765
1766         r = sd_rtnl_message_close_container(m);
1767         if (r < 0)
1768                 return log_error_errno(r, "Failed to close netlink container: %m");
1769
1770         r = sd_rtnl_message_close_container(m);
1771         if (r < 0)
1772                 return log_error_errno(r, "Failed to close netlink container: %m");
1773
1774         r = sd_rtnl_message_close_container(m);
1775         if (r < 0)
1776                 return log_error_errno(r, "Failed to close netlink container: %m");
1777
1778         r = sd_rtnl_call(rtnl, m, 0, NULL);
1779         if (r < 0)
1780                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
1781
1782         i = (int) if_nametoindex(iface_name);
1783         if (i <= 0)
1784                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
1785
1786         *ifi = i;
1787
1788         return 0;
1789 }
1790
1791 static int setup_bridge(const char veth_name[], int *ifi) {
1792         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1793         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1794         int r, bridge;
1795
1796         if (!arg_private_network)
1797                 return 0;
1798
1799         if (!arg_network_veth)
1800                 return 0;
1801
1802         if (!arg_network_bridge)
1803                 return 0;
1804
1805         bridge = (int) if_nametoindex(arg_network_bridge);
1806         if (bridge <= 0)
1807                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
1808
1809         *ifi = bridge;
1810
1811         r = sd_rtnl_open(&rtnl, 0);
1812         if (r < 0)
1813                 return log_error_errno(r, "Failed to connect to netlink: %m");
1814
1815         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1816         if (r < 0)
1817                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1818
1819         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1820         if (r < 0)
1821                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
1822
1823         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1824         if (r < 0)
1825                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
1826
1827         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1828         if (r < 0)
1829                 return log_error_errno(r, "Failed to add netlink master field: %m");
1830
1831         r = sd_rtnl_call(rtnl, m, 0, NULL);
1832         if (r < 0)
1833                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
1834
1835         return 0;
1836 }
1837
1838 static int parse_interface(struct udev *udev, const char *name) {
1839         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1840         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1841         int ifi;
1842
1843         ifi = (int) if_nametoindex(name);
1844         if (ifi <= 0)
1845                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
1846
1847         sprintf(ifi_str, "n%i", ifi);
1848         d = udev_device_new_from_device_id(udev, ifi_str);
1849         if (!d)
1850                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
1851
1852         if (udev_device_get_is_initialized(d) <= 0) {
1853                 log_error("Network interface %s is not initialized yet.", name);
1854                 return -EBUSY;
1855         }
1856
1857         return ifi;
1858 }
1859
1860 static int move_network_interfaces(pid_t pid) {
1861         _cleanup_udev_unref_ struct udev *udev = NULL;
1862         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1863         char **i;
1864         int r;
1865
1866         if (!arg_private_network)
1867                 return 0;
1868
1869         if (strv_isempty(arg_network_interfaces))
1870                 return 0;
1871
1872         r = sd_rtnl_open(&rtnl, 0);
1873         if (r < 0)
1874                 return log_error_errno(r, "Failed to connect to netlink: %m");
1875
1876         udev = udev_new();
1877         if (!udev) {
1878                 log_error("Failed to connect to udev.");
1879                 return -ENOMEM;
1880         }
1881
1882         STRV_FOREACH(i, arg_network_interfaces) {
1883                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1884                 int ifi;
1885
1886                 ifi = parse_interface(udev, *i);
1887                 if (ifi < 0)
1888                         return ifi;
1889
1890                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1891                 if (r < 0)
1892                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1893
1894                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1895                 if (r < 0)
1896                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
1897
1898                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1899                 if (r < 0)
1900                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
1901         }
1902
1903         return 0;
1904 }
1905
1906 static int setup_macvlan(pid_t pid) {
1907         _cleanup_udev_unref_ struct udev *udev = NULL;
1908         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1909         unsigned idx = 0;
1910         char **i;
1911         int r;
1912
1913         if (!arg_private_network)
1914                 return 0;
1915
1916         if (strv_isempty(arg_network_macvlan))
1917                 return 0;
1918
1919         r = sd_rtnl_open(&rtnl, 0);
1920         if (r < 0)
1921                 return log_error_errno(r, "Failed to connect to netlink: %m");
1922
1923         udev = udev_new();
1924         if (!udev) {
1925                 log_error("Failed to connect to udev.");
1926                 return -ENOMEM;
1927         }
1928
1929         STRV_FOREACH(i, arg_network_macvlan) {
1930                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1931                 _cleanup_free_ char *n = NULL;
1932                 struct ether_addr mac;
1933                 int ifi;
1934
1935                 ifi = parse_interface(udev, *i);
1936                 if (ifi < 0)
1937                         return ifi;
1938
1939                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
1940                 if (r < 0)
1941                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
1942
1943                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1944                 if (r < 0)
1945                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1946
1947                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1948                 if (r < 0)
1949                         return log_error_errno(r, "Failed to add netlink interface index: %m");
1950
1951                 n = strappend("mv-", *i);
1952                 if (!n)
1953                         return log_oom();
1954
1955                 strshorten(n, IFNAMSIZ-1);
1956
1957                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1958                 if (r < 0)
1959                         return log_error_errno(r, "Failed to add netlink interface name: %m");
1960
1961                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1962                 if (r < 0)
1963                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
1964
1965                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1966                 if (r < 0)
1967                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
1968
1969                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1970                 if (r < 0)
1971                         return log_error_errno(r, "Failed to open netlink container: %m");
1972
1973                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1974                 if (r < 0)
1975                         return log_error_errno(r, "Failed to open netlink container: %m");
1976
1977                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1978                 if (r < 0)
1979                         return log_error_errno(r, "Failed to append macvlan mode: %m");
1980
1981                 r = sd_rtnl_message_close_container(m);
1982                 if (r < 0)
1983                         return log_error_errno(r, "Failed to close netlink container: %m");
1984
1985                 r = sd_rtnl_message_close_container(m);
1986                 if (r < 0)
1987                         return log_error_errno(r, "Failed to close netlink container: %m");
1988
1989                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1990                 if (r < 0)
1991                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
1992         }
1993
1994         return 0;
1995 }
1996
/* Builds and loads the seccomp filter for the container.
 *
 * The filter allows everything by default, makes a fixed list of
 * syscalls fail with EPERM, and makes socket(AF_NETLINK, *,
 * NETLINK_AUDIT) fail with EAFNOSUPPORT. The SCMP_FLTATR_CTL_NNP
 * attribute is set to 0 before loading.
 *
 * Returns the result of the last libseccomp call (negative on
 * failure, after logging). When built without HAVE_SECCOMP this is a
 * no-op returning 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int denied_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto release;
        }

        for (k = 0; k < ELEMENTSOF(denied_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), denied_syscalls[k], 0);

                /* -EFAULT means the syscall is unknown; that is not
                 * an error, simply move on to the next one. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto release;
                }
        }

        /* Audit does not work in containers, and much of the userspace
         * audit hookup fails when run inside one. We do not care and
         * simply prevent audit sockets from being created.
         *
         * With this rule socket(AF_NETLINK, *, NETLINK_AUDIT) fails
         * with EAFNOSUPPORT, which audit userspace takes as a sign
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(ctx,
                             SCMP_ACT_ERRNO(EAFNOSUPPORT),
                             SCMP_SYS(socket),
                             2,
                             SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                             SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto release;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto release;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

release:
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2076
2077 static int setup_image(char **device_path, int *loop_nr) {
2078         struct loop_info64 info = {
2079                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2080         };
2081         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2082         _cleanup_free_ char* loopdev = NULL;
2083         struct stat st;
2084         int r, nr;
2085
2086         assert(device_path);
2087         assert(loop_nr);
2088         assert(arg_image);
2089
2090         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2091         if (fd < 0)
2092                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2093
2094         if (fstat(fd, &st) < 0)
2095                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2096
2097         if (S_ISBLK(st.st_mode)) {
2098                 char *p;
2099
2100                 p = strdup(arg_image);
2101                 if (!p)
2102                         return log_oom();
2103
2104                 *device_path = p;
2105
2106                 *loop_nr = -1;
2107
2108                 r = fd;
2109                 fd = -1;
2110
2111                 return r;
2112         }
2113
2114         if (!S_ISREG(st.st_mode)) {
2115                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2116                 return -EINVAL;
2117         }
2118
2119         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2120         if (control < 0)
2121                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2122
2123         nr = ioctl(control, LOOP_CTL_GET_FREE);
2124         if (nr < 0)
2125                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2126
2127         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2128                 return log_oom();
2129
2130         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2131         if (loop < 0)
2132                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2133
2134         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2135                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2136
2137         if (arg_read_only)
2138                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2139
2140         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2141                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2142
2143         *device_path = loopdev;
2144         loopdev = NULL;
2145
2146         *loop_nr = nr;
2147
2148         r = loop;
2149         loop = -1;
2150
2151         return r;
2152 }
2153
2154 static int dissect_image(
2155                 int fd,
2156                 char **root_device, bool *root_device_rw,
2157                 char **home_device, bool *home_device_rw,
2158                 char **srv_device, bool *srv_device_rw,
2159                 bool *secondary) {
2160
2161 #ifdef HAVE_BLKID
2162         int home_nr = -1, srv_nr = -1;
2163 #ifdef GPT_ROOT_NATIVE
2164         int root_nr = -1;
2165 #endif
2166 #ifdef GPT_ROOT_SECONDARY
2167         int secondary_root_nr = -1;
2168 #endif
2169
2170         _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2171         _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2172         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2173         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2174         _cleanup_udev_unref_ struct udev *udev = NULL;
2175         struct udev_list_entry *first, *item;
2176         bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2177         const char *pttype = NULL;
2178         blkid_partlist pl;
2179         struct stat st;
2180         int r;
2181
2182         assert(fd >= 0);
2183         assert(root_device);
2184         assert(home_device);
2185         assert(srv_device);
2186         assert(secondary);
2187         assert(arg_image);
2188
2189         b = blkid_new_probe();
2190         if (!b)
2191                 return log_oom();
2192
2193         errno = 0;
2194         r = blkid_probe_set_device(b, fd, 0, 0);
2195         if (r != 0) {
2196                 if (errno == 0)
2197                         return log_oom();
2198
2199                 log_error_errno(errno, "Failed to set device on blkid probe: %m");
2200                 return -errno;
2201         }
2202
2203         blkid_probe_enable_partitions(b, 1);
2204         blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2205
2206         errno = 0;
2207         r = blkid_do_safeprobe(b);
2208         if (r == -2 || r == 1) {
2209                 log_error("Failed to identify any partition table on %s.\n"
2210                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2211                 return -EINVAL;
2212         } else if (r != 0) {
2213                 if (errno == 0)
2214                         errno = EIO;
2215                 log_error_errno(errno, "Failed to probe: %m");
2216                 return -errno;
2217         }
2218
2219         blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2220         if (!streq_ptr(pttype, "gpt")) {
2221                 log_error("Image %s does not carry a GUID Partition Table.\n"
2222                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2223                 return -EINVAL;
2224         }
2225
2226         errno = 0;
2227         pl = blkid_probe_get_partitions(b);
2228         if (!pl) {
2229                 if (errno == 0)
2230                         return log_oom();
2231
2232                 log_error("Failed to list partitions of %s", arg_image);
2233                 return -errno;
2234         }
2235
2236         udev = udev_new();
2237         if (!udev)
2238                 return log_oom();
2239
2240         if (fstat(fd, &st) < 0)
2241                 return log_error_errno(errno, "Failed to stat block device: %m");
2242
2243         d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2244         if (!d)
2245                 return log_oom();
2246
2247         e = udev_enumerate_new(udev);
2248         if (!e)
2249                 return log_oom();
2250
2251         r = udev_enumerate_add_match_parent(e, d);
2252         if (r < 0)
2253                 return log_oom();
2254
2255         r = udev_enumerate_scan_devices(e);
2256         if (r < 0)
2257                 return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);
2258
2259         first = udev_enumerate_get_list_entry(e);
2260         udev_list_entry_foreach(item, first) {
2261                 _cleanup_udev_device_unref_ struct udev_device *q;
2262                 const char *stype, *node;
2263                 unsigned long long flags;
2264                 sd_id128_t type_id;
2265                 blkid_partition pp;
2266                 dev_t qn;
2267                 int nr;
2268
2269                 errno = 0;
2270                 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2271                 if (!q) {
2272                         if (!errno)
2273                                 errno = ENOMEM;
2274
2275                         log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
2276                         return -errno;
2277                 }
2278
2279                 qn = udev_device_get_devnum(q);
2280                 if (major(qn) == 0)
2281                         continue;
2282
2283                 if (st.st_rdev == qn)
2284                         continue;
2285
2286                 node = udev_device_get_devnode(q);
2287                 if (!node)
2288                         continue;
2289
2290                 pp = blkid_partlist_devno_to_partition(pl, qn);
2291                 if (!pp)
2292                         continue;
2293
2294                 flags = blkid_partition_get_flags(pp);
2295                 if (flags & GPT_FLAG_NO_AUTO)
2296                         continue;
2297
2298                 nr = blkid_partition_get_partno(pp);
2299                 if (nr < 0)
2300                         continue;
2301
2302                 stype = blkid_partition_get_type_string(pp);
2303                 if (!stype)
2304                         continue;
2305
2306                 if (sd_id128_from_string(stype, &type_id) < 0)
2307                         continue;
2308
2309                 if (sd_id128_equal(type_id, GPT_HOME)) {
2310
2311                         if (home && nr >= home_nr)
2312                                 continue;
2313
2314                         home_nr = nr;
2315                         home_rw = !(flags & GPT_FLAG_READ_ONLY);
2316
2317                         free(home);
2318                         home = strdup(node);
2319                         if (!home)
2320                                 return log_oom();
2321                 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2322
2323                         if (srv && nr >= srv_nr)
2324                                 continue;
2325
2326                         srv_nr = nr;
2327                         srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2328
2329                         free(srv);
2330                         srv = strdup(node);
2331                         if (!srv)
2332                                 return log_oom();
2333                 }
2334 #ifdef GPT_ROOT_NATIVE
2335                 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2336
2337                         if (root && nr >= root_nr)
2338                                 continue;
2339
2340                         root_nr = nr;
2341                         root_rw = !(flags & GPT_FLAG_READ_ONLY);
2342
2343                         free(root);
2344                         root = strdup(node);
2345                         if (!root)
2346                                 return log_oom();
2347                 }
2348 #endif
2349 #ifdef GPT_ROOT_SECONDARY
2350                 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2351
2352                         if (secondary_root && nr >= secondary_root_nr)
2353                                 continue;
2354
2355                         secondary_root_nr = nr;
2356                         secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2357
2358
2359                         free(secondary_root);
2360                         secondary_root = strdup(node);
2361                         if (!secondary_root)
2362                                 return log_oom();
2363                 }
2364 #endif
2365         }
2366
2367         if (!root && !secondary_root) {
2368                 log_error("Failed to identify root partition in disk image %s.\n"
2369                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2370                 return -EINVAL;
2371         }
2372
2373         if (root) {
2374                 *root_device = root;
2375                 root = NULL;
2376
2377                 *root_device_rw = root_rw;
2378                 *secondary = false;
2379         } else if (secondary_root) {
2380                 *root_device = secondary_root;
2381                 secondary_root = NULL;
2382
2383                 *root_device_rw = secondary_root_rw;
2384                 *secondary = true;
2385         }
2386
2387         if (home) {
2388                 *home_device = home;
2389                 home = NULL;
2390
2391                 *home_device_rw = home_rw;
2392         }
2393
2394         if (srv) {
2395                 *srv_device = srv;
2396                 srv = NULL;
2397
2398                 *srv_device_rw = srv_rw;
2399         }
2400
2401         return 0;
2402 #else
2403         log_error("--image= is not supported, compiled without blkid support.");
2404         return -ENOTSUP;
2405 #endif
2406 }
2407
/* Probes the block device 'what' for its file system type and mounts it.
 *
 * what:      path of the block device to mount.
 * where:     base directory of the mount target.
 * directory: optional suffix appended to 'where' to form the mount point;
 *            if NULL the device is mounted directly on 'where'.
 * rw:        mount writable if true; forced to read-only when arg_read_only
 *            is set.
 *
 * Returns 0 on success and a negative errno-style value on failure. Without
 * blkid support compiled in, always fails with -ENOTSUP. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally read-only container overrides the per-device setting */
        if (arg_read_only)
                rw = false;

        p = where;
        if (directory)
                p = strappenda(where, directory);

        /* errno is cleared first so that a NULL result with errno still 0
         * can be told apart and reported as out-of-memory */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();

                /* Return the logger's result rather than re-reading errno
                 * after the logging call */
                return log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -1 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        }
        if (r != 0) {
                /* Fall back to EIO if the probe did not set errno */
                if (errno == 0)
                        errno = EIO;
                return log_error_errno(errno, "Failed to probe %s: %m", what);
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                /* Capture the error code before logging; fall back to EINVAL
                 * if the lookup did not set errno */
                r = errno != 0 ? -errno : -EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return r;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2471
2472 static int mount_devices(
2473                 const char *where,
2474                 const char *root_device, bool root_device_rw,
2475                 const char *home_device, bool home_device_rw,
2476                 const char *srv_device, bool srv_device_rw) {
2477         int r;
2478
2479         assert(where);
2480
2481         if (root_device) {
2482                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2483                 if (r < 0)
2484                         return log_error_errno(r, "Failed to mount root directory: %m");
2485         }
2486
2487         if (home_device) {
2488                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2489                 if (r < 0)
2490                         return log_error_errno(r, "Failed to mount home directory: %m");
2491         }
2492
2493         if (srv_device) {
2494                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2495                 if (r < 0)
2496                         return log_error_errno(r, "Failed to mount server data directory: %m");
2497         }
2498
2499         return 0;
2500 }
2501
2502 static void loop_remove(int nr, int *image_fd) {
2503         _cleanup_close_ int control = -1;
2504         int r;
2505
2506         if (nr < 0)
2507                 return;
2508
2509         if (image_fd && *image_fd >= 0) {
2510                 r = ioctl(*image_fd, LOOP_CLR_FD);
2511                 if (r < 0)
2512                         log_warning_errno(errno, "Failed to close loop image: %m");
2513                 *image_fd = safe_close(*image_fd);
2514         }
2515
2516         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2517         if (control < 0) {
2518                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2519                 return;
2520         }
2521
2522         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2523         if (r < 0)
2524                 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
2525 }
2526
2527 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2528         int pipe_fds[2];
2529         pid_t pid;
2530
2531         assert(database);
2532         assert(key);
2533         assert(rpid);
2534
2535         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2536                 return log_error_errno(errno, "Failed to allocate pipe: %m");
2537
2538         pid = fork();
2539         if (pid < 0)
2540                 return log_error_errno(errno, "Failed to fork getent child: %m");
2541         else if (pid == 0) {
2542                 int nullfd;
2543                 char *empty_env = NULL;
2544
2545                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2546                         _exit(EXIT_FAILURE);
2547
2548                 if (pipe_fds[0] > 2)
2549                         safe_close(pipe_fds[0]);
2550                 if (pipe_fds[1] > 2)
2551                         safe_close(pipe_fds[1]);
2552
2553                 nullfd = open("/dev/null", O_RDWR);
2554                 if (nullfd < 0)
2555                         _exit(EXIT_FAILURE);
2556
2557                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2558                         _exit(EXIT_FAILURE);
2559
2560                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2561                         _exit(EXIT_FAILURE);
2562
2563                 if (nullfd > 2)
2564                         safe_close(nullfd);
2565
2566                 reset_all_signal_handlers();
2567                 close_all_fds(NULL, 0);
2568
2569                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2570                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2571                 _exit(EXIT_FAILURE);
2572         }
2573
2574         pipe_fds[1] = safe_close(pipe_fds[1]);
2575
2576         *rpid = pid;
2577
2578         return pipe_fds[0];
2579 }
2580
2581 static int change_uid_gid(char **_home) {
2582         char line[LINE_MAX], *x, *u, *g, *h;
2583         const char *word, *state;
2584         _cleanup_free_ uid_t *uids = NULL;
2585         _cleanup_free_ char *home = NULL;
2586         _cleanup_fclose_ FILE *f = NULL;
2587         _cleanup_close_ int fd = -1;
2588         unsigned n_uids = 0;
2589         size_t sz = 0, l;
2590         uid_t uid;
2591         gid_t gid;
2592         pid_t pid;
2593         int r;
2594
2595         assert(_home);
2596
2597         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2598                 /* Reset everything fully to 0, just in case */
2599
2600                 if (setgroups(0, NULL) < 0)
2601                         return log_error_errno(errno, "setgroups() failed: %m");
2602
2603                 if (setresgid(0, 0, 0) < 0)
2604                         return log_error_errno(errno, "setregid() failed: %m");
2605
2606                 if (setresuid(0, 0, 0) < 0)
2607                         return log_error_errno(errno, "setreuid() failed: %m");
2608
2609                 *_home = NULL;
2610                 return 0;
2611         }
2612
2613         /* First, get user credentials */
2614         fd = spawn_getent("passwd", arg_user, &pid);
2615         if (fd < 0)
2616                 return fd;
2617
2618         f = fdopen(fd, "r");
2619         if (!f)
2620                 return log_oom();
2621         fd = -1;
2622
2623         if (!fgets(line, sizeof(line), f)) {
2624
2625                 if (!ferror(f)) {
2626                         log_error("Failed to resolve user %s.", arg_user);
2627                         return -ESRCH;
2628                 }
2629
2630                 log_error_errno(errno, "Failed to read from getent: %m");
2631                 return -errno;
2632         }
2633
2634         truncate_nl(line);
2635
2636         wait_for_terminate_and_warn("getent passwd", pid, true);
2637
2638         x = strchr(line, ':');
2639         if (!x) {
2640                 log_error("/etc/passwd entry has invalid user field.");
2641                 return -EIO;
2642         }
2643
2644         u = strchr(x+1, ':');
2645         if (!u) {
2646                 log_error("/etc/passwd entry has invalid password field.");
2647                 return -EIO;
2648         }
2649
2650         u++;
2651         g = strchr(u, ':');
2652         if (!g) {
2653                 log_error("/etc/passwd entry has invalid UID field.");
2654                 return -EIO;
2655         }
2656
2657         *g = 0;
2658         g++;
2659         x = strchr(g, ':');
2660         if (!x) {
2661                 log_error("/etc/passwd entry has invalid GID field.");
2662                 return -EIO;
2663         }
2664
2665         *x = 0;
2666         h = strchr(x+1, ':');
2667         if (!h) {
2668                 log_error("/etc/passwd entry has invalid GECOS field.");
2669                 return -EIO;
2670         }
2671
2672         h++;
2673         x = strchr(h, ':');
2674         if (!x) {
2675                 log_error("/etc/passwd entry has invalid home directory field.");
2676                 return -EIO;
2677         }
2678
2679         *x = 0;
2680
2681         r = parse_uid(u, &uid);
2682         if (r < 0) {
2683                 log_error("Failed to parse UID of user.");
2684                 return -EIO;
2685         }
2686
2687         r = parse_gid(g, &gid);
2688         if (r < 0) {
2689                 log_error("Failed to parse GID of user.");
2690                 return -EIO;
2691         }
2692
2693         home = strdup(h);
2694         if (!home)
2695                 return log_oom();
2696
2697         /* Second, get group memberships */
2698         fd = spawn_getent("initgroups", arg_user, &pid);
2699         if (fd < 0)
2700                 return fd;
2701
2702         fclose(f);
2703         f = fdopen(fd, "r");
2704         if (!f)
2705                 return log_oom();
2706         fd = -1;
2707
2708         if (!fgets(line, sizeof(line), f)) {
2709                 if (!ferror(f)) {
2710                         log_error("Failed to resolve user %s.", arg_user);
2711                         return -ESRCH;
2712                 }
2713
2714                 log_error_errno(errno, "Failed to read from getent: %m");
2715                 return -errno;
2716         }
2717
2718         truncate_nl(line);
2719
2720         wait_for_terminate_and_warn("getent initgroups", pid, true);
2721
2722         /* Skip over the username and subsequent separator whitespace */
2723         x = line;
2724         x += strcspn(x, WHITESPACE);
2725         x += strspn(x, WHITESPACE);
2726
2727         FOREACH_WORD(word, l, x, state) {
2728                 char c[l+1];
2729
2730                 memcpy(c, word, l);
2731                 c[l] = 0;
2732
2733                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2734                         return log_oom();
2735
2736                 r = parse_uid(c, &uids[n_uids++]);
2737                 if (r < 0) {
2738                         log_error("Failed to parse group data from getent.");
2739                         return -EIO;
2740                 }
2741         }
2742
2743         r = mkdir_parents(home, 0775);
2744         if (r < 0)
2745                 return log_error_errno(r, "Failed to make home root directory: %m");
2746
2747         r = mkdir_safe(home, 0755, uid, gid);
2748         if (r < 0 && r != -EEXIST)
2749                 return log_error_errno(r, "Failed to make home directory: %m");
2750
2751         fchown(STDIN_FILENO, uid, gid);
2752         fchown(STDOUT_FILENO, uid, gid);
2753         fchown(STDERR_FILENO, uid, gid);
2754
2755         if (setgroups(n_uids, uids) < 0)
2756                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
2757
2758         if (setresgid(gid, gid, gid) < 0)
2759                 return log_error_errno(errno, "setregid() failed: %m");
2760
2761         if (setresuid(uid, uid, uid) < 0)
2762                 return log_error_errno(errno, "setreuid() failed: %m");
2763
2764         if (_home) {
2765                 *_home = home;
2766                 home = NULL;
2767         }
2768
2769         return 0;
2770 }
2771
2772 /*
2773  * Return values:
2774  * < 0 : wait_for_terminate() failed to get the state of the
2775  *       container, the container was terminated by a signal, or
2776  *       failed for an unknown reason.  No change is made to the
2777  *       container argument.
2778  * > 0 : The program executed in the container terminated with an
2779  *       error.  The exit code of the program executed in the
2780  *       container is returned.  The container argument has been set
2781  *       to CONTAINER_TERMINATED.
2782  *   0 : The container is being rebooted, has been shut down or exited
2783  *       successfully.  The container argument has been set to either
2784  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2785  *
2786  * That is, success is indicated by a return value of zero, and an
2787  * error is indicated by a non-zero value.
2788  */
2789 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2790         siginfo_t status;
2791         int r;
2792
2793         r = wait_for_terminate(pid, &status);
2794         if (r < 0)
2795                 return log_warning_errno(r, "Failed to wait for container: %m");
2796
2797         switch (status.si_code) {
2798
2799         case CLD_EXITED:
2800                 if (status.si_status == 0) {
2801                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2802
2803                 } else
2804                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2805
2806                 *container = CONTAINER_TERMINATED;
2807                 return status.si_status;
2808
2809         case CLD_KILLED:
2810                 if (status.si_status == SIGINT) {
2811
2812                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2813                         *container = CONTAINER_TERMINATED;
2814                         return 0;
2815
2816                 } else if (status.si_status == SIGHUP) {
2817
2818                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2819                         *container = CONTAINER_REBOOTED;
2820                         return 0;
2821                 }
2822
2823                 /* CLD_KILLED fallthrough */
2824
2825         case CLD_DUMPED:
2826                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2827                 return -EIO;
2828
2829         default:
2830                 log_error("Container %s failed due to unknown reason.", arg_machine);
2831                 return -EIO;
2832         }
2833
2834         return r;
2835 }
2836
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
}
2838
2839 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2840         pid_t pid;
2841
2842         pid = PTR_TO_UINT32(userdata);
2843         if (pid > 0) {
2844                 if (kill(pid, SIGRTMIN+3) >= 0) {
2845                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2846                         sd_event_source_set_userdata(s, NULL);
2847                         return 0;
2848                 }
2849         }
2850
2851         sd_event_exit(sd_event_source_get_event(s), 0);
2852         return 0;
2853 }
2854
2855 static int determine_names(void) {
2856
2857         if (!arg_image && !arg_directory) {
2858                 if (arg_machine)
2859                         arg_directory = strappend("/var/lib/container/", arg_machine);
2860                 else
2861                         arg_directory = get_current_dir_name();
2862
2863                 if (!arg_directory) {
2864                         log_error("Failed to determine path, please use -D.");
2865                         return -EINVAL;
2866                 }
2867         }
2868
2869         if (!arg_machine) {
2870                 arg_machine = strdup(basename(arg_image ?: arg_directory));
2871                 if (!arg_machine)
2872                         return log_oom();
2873
2874                 hostname_cleanup(arg_machine, false);
2875                 if (!machine_name_is_valid(arg_machine)) {
2876                         log_error("Failed to determine machine name automatically, please use -M.");
2877                         return -EINVAL;
2878                 }
2879         }
2880
2881         return 0;
2882 }
2883
2884 int main(int argc, char *argv[]) {
2885
2886         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2887         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2888         _cleanup_close_ int master = -1, image_fd = -1;
2889         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2890         _cleanup_fdset_free_ FDSet *fds = NULL;
2891         int r, n_fd_passed, loop_nr = -1;
2892         const char *console = NULL;
2893         char veth_name[IFNAMSIZ];
2894         bool secondary = false, remove_subvol = false;
2895         sigset_t mask, mask_chld;
2896         pid_t pid = 0;
2897         int ret = EXIT_SUCCESS;
2898
2899         log_parse_environment();
2900         log_open();
2901
2902         r = parse_argv(argc, argv);
2903         if (r <= 0)
2904                 goto finish;
2905
2906         r = determine_names();
2907         if (r < 0)
2908                 goto finish;
2909
2910         if (geteuid() != 0) {
2911                 log_error("Need to be root.");
2912                 r = -EPERM;
2913                 goto finish;
2914         }
2915
2916         if (sd_booted() <= 0) {
2917                 log_error("Not running on a systemd system.");
2918                 r = -EINVAL;
2919                 goto finish;
2920         }
2921
2922         log_close();
2923         n_fd_passed = sd_listen_fds(false);
2924         if (n_fd_passed > 0) {
2925                 r = fdset_new_listen_fds(&fds, false);
2926                 if (r < 0) {
2927                         log_error_errno(r, "Failed to collect file descriptors: %m");
2928                         goto finish;
2929                 }
2930         }
2931         fdset_close_others(fds);
2932         log_open();
2933
2934         if (arg_directory) {
2935                 assert(!arg_image);
2936
2937                 if (path_equal(arg_directory, "/")) {
2938                         log_error("Spawning container on root directory not supported.");
2939                         r = -EINVAL;
2940                         goto finish;
2941                 }
2942
2943                 if (arg_template) {
2944                         r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
2945                         if (r == -EEXIST) {
2946                                 if (!arg_quiet)
2947                                         log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
2948                         } else if (r < 0) {
2949                                 log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
2950                                 goto finish;
2951                         } else {
2952                                 if (!arg_quiet)
2953                                         log_info("Populated %s from template %s.", arg_directory, arg_template);
2954                         }
2955
2956                 } else if (arg_ephemeral) {
2957                         char *np;
2958
2959                         r = tempfn_random(arg_directory, &np);
2960                         if (r < 0) {
2961                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
2962                                 goto finish;
2963                         }
2964
2965                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
2966                         if (r < 0) {
2967                                 free(np);
2968                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
2969                                 goto finish;
2970                         }
2971
2972                         free(arg_directory);
2973                         arg_directory = np;
2974
2975                         remove_subvol = true;
2976                 }
2977
2978                 if (arg_boot) {
2979                         if (path_is_os_tree(arg_directory) <= 0) {
2980                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2981                                 r = -EINVAL;
2982                                 goto finish;
2983                         }
2984                 } else {
2985                         const char *p;
2986
2987                         p = strappenda(arg_directory,
2988                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2989                         if (access(p, F_OK) < 0) {
2990                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2991                                 r = -EINVAL;
2992                                 goto finish;
2993                         }
2994                 }
2995
2996         } else {
2997                 char template[] = "/tmp/nspawn-root-XXXXXX";
2998
2999                 assert(arg_image);
3000                 assert(!arg_template);
3001
3002                 if (!mkdtemp(template)) {
3003                         log_error_errno(errno, "Failed to create temporary directory: %m");
3004                         r = -errno;
3005                         goto finish;
3006                 }
3007
3008                 arg_directory = strdup(template);
3009                 if (!arg_directory) {
3010                         r = log_oom();
3011                         goto finish;
3012                 }
3013
3014                 image_fd = setup_image(&device_path, &loop_nr);
3015                 if (image_fd < 0) {
3016                         r = image_fd;
3017                         goto finish;
3018                 }
3019
3020                 r = dissect_image(image_fd,
3021                                   &root_device, &root_device_rw,
3022                                   &home_device, &home_device_rw,
3023                                   &srv_device, &srv_device_rw,
3024                                   &secondary);
3025                 if (r < 0)
3026                         goto finish;
3027         }
3028
3029         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3030         if (master < 0) {
3031                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3032                 goto finish;
3033         }
3034
3035         console = ptsname(master);
3036         if (!console) {
3037                 r = log_error_errno(errno, "Failed to determine tty name: %m");
3038                 goto finish;
3039         }
3040
3041         if (!arg_quiet)
3042                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3043                          arg_machine, arg_image ?: arg_directory);
3044
3045         if (unlockpt(master) < 0) {
3046                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3047                 goto finish;
3048         }
3049
3050         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3051                 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3052                 goto finish;
3053         }
3054
3055         sd_notify(false,
3056                   "READY=1\n"
3057                   "STATUS=Container running.");
3058
3059         assert_se(sigemptyset(&mask) == 0);
3060         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3061         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3062
3063         assert_se(sigemptyset(&mask_chld) == 0);
3064         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3065
3066         for (;;) {
3067                 ContainerStatus container_status;
3068                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3069                 struct sigaction sa = {
3070                         .sa_handler = nop_handler,
3071                         .sa_flags = SA_NOCLDSTOP,
3072                 };
3073
3074                 r = barrier_create(&barrier);
3075                 if (r < 0) {
3076                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3077                         goto finish;
3078                 }
3079
3080                 /* Child can be killed before execv(), so handle SIGCHLD
3081                  * in order to interrupt parent's blocking calls and
3082                  * give it a chance to call wait() and terminate. */
3083                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3084                 if (r < 0) {
3085                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3086                         goto finish;
3087                 }
3088
3089                 r = sigaction(SIGCHLD, &sa, NULL);
3090                 if (r < 0) {
3091                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3092                         goto finish;
3093                 }
3094
3095                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3096                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3097                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
3098                 if (pid < 0) {
3099                         if (errno == EINVAL)
3100                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3101                         else
3102                                 r = log_error_errno(errno, "clone() failed: %m");
3103
3104                         goto finish;
3105                 }
3106
3107                 if (pid == 0) {
3108                         /* child */
3109                         _cleanup_free_ char *home = NULL;
3110                         unsigned n_env = 2;
3111                         const char *envp[] = {
3112                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3113                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3114                                 NULL, /* TERM */
3115                                 NULL, /* HOME */
3116                                 NULL, /* USER */
3117                                 NULL, /* LOGNAME */
3118                                 NULL, /* container_uuid */
3119                                 NULL, /* LISTEN_FDS */
3120                                 NULL, /* LISTEN_PID */
3121                                 NULL
3122                         };
3123                         char **env_use;
3124
3125                         barrier_set_role(&barrier, BARRIER_CHILD);
3126
3127                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3128                         if (envp[n_env])
3129                                 n_env ++;
3130
3131                         master = safe_close(master);
3132
3133                         close_nointr(STDIN_FILENO);
3134                         close_nointr(STDOUT_FILENO);
3135                         close_nointr(STDERR_FILENO);
3136
3137                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3138
3139                         reset_all_signal_handlers();
3140                         reset_signal_mask();
3141
3142                         r = open_terminal(console, O_RDWR);
3143                         if (r != STDIN_FILENO) {
3144                                 if (r >= 0) {
3145                                         safe_close(r);
3146                                         r = -EINVAL;
3147                                 }
3148
3149                                 log_error_errno(r, "Failed to open console: %m");
3150                                 _exit(EXIT_FAILURE);
3151                         }
3152
3153                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3154                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3155                                 log_error_errno(errno, "Failed to duplicate console: %m");
3156                                 _exit(EXIT_FAILURE);
3157                         }
3158
3159                         if (setsid() < 0) {
3160                                 log_error_errno(errno, "setsid() failed: %m");
3161                                 _exit(EXIT_FAILURE);
3162                         }
3163
3164                         if (reset_audit_loginuid() < 0)
3165                                 _exit(EXIT_FAILURE);
3166
3167                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3168                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3169                                 _exit(EXIT_FAILURE);
3170                         }
3171
3172                         /* Mark everything as slave, so that we still
3173                          * receive mounts from the real root, but don't
3174                          * propagate mounts to the real root. */
3175                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3176                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3177                                 _exit(EXIT_FAILURE);
3178                         }
3179
3180                         if (mount_devices(arg_directory,
3181                                           root_device, root_device_rw,
3182                                           home_device, home_device_rw,
3183                                           srv_device, srv_device_rw) < 0)
3184                                 _exit(EXIT_FAILURE);
3185
3186                         /* Turn directory into bind mount */
3187                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3188                                 log_error_errno(errno, "Failed to make bind mount: %m");
3189                                 _exit(EXIT_FAILURE);
3190                         }
3191
3192                         r = setup_volatile(arg_directory);
3193                         if (r < 0)
3194                                 _exit(EXIT_FAILURE);
3195
3196                         if (setup_volatile_state(arg_directory) < 0)
3197                                 _exit(EXIT_FAILURE);
3198
3199                         r = base_filesystem_create(arg_directory);
3200                         if (r < 0)
3201                                 _exit(EXIT_FAILURE);
3202
3203                         if (arg_read_only) {
3204                                 r = bind_remount_recursive(arg_directory, true);
3205                                 if (r < 0) {
3206                                         log_error_errno(r, "Failed to make tree read-only: %m");
3207                                         _exit(EXIT_FAILURE);
3208                                 }
3209                         }
3210
3211                         if (mount_all(arg_directory) < 0)
3212                                 _exit(EXIT_FAILURE);
3213
3214                         if (copy_devnodes(arg_directory) < 0)
3215                                 _exit(EXIT_FAILURE);
3216
3217                         if (setup_ptmx(arg_directory) < 0)
3218                                 _exit(EXIT_FAILURE);
3219
3220                         dev_setup(arg_directory);
3221
3222                         if (setup_seccomp() < 0)
3223                                 _exit(EXIT_FAILURE);
3224
3225                         if (setup_dev_console(arg_directory, console) < 0)
3226                                 _exit(EXIT_FAILURE);
3227
3228                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3229                                 _exit(EXIT_FAILURE);
3230
3231                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3232
3233                         if (setup_boot_id(arg_directory) < 0)
3234                                 _exit(EXIT_FAILURE);
3235
3236                         if (setup_timezone(arg_directory) < 0)
3237                                 _exit(EXIT_FAILURE);
3238
3239                         if (setup_resolv_conf(arg_directory) < 0)
3240                                 _exit(EXIT_FAILURE);
3241
3242                         if (setup_journal(arg_directory) < 0)
3243                                 _exit(EXIT_FAILURE);
3244
3245                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3246                                 _exit(EXIT_FAILURE);
3247
3248                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3249                                 _exit(EXIT_FAILURE);
3250
3251                         if (mount_tmpfs(arg_directory) < 0)
3252                                 _exit(EXIT_FAILURE);
3253
3254                         /* Tell the parent that we are ready, and that
3255                          * it can cgroupify us so that we lack access
3256                          * to certain devices and resources. */
3257                         (void)barrier_place(&barrier);
3258
3259                         if (chdir(arg_directory) < 0) {
3260                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3261                                 _exit(EXIT_FAILURE);
3262                         }
3263
3264                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3265                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3266                                 _exit(EXIT_FAILURE);
3267                         }
3268
3269                         if (chroot(".") < 0) {
3270                                 log_error_errno(errno, "chroot() failed: %m");
3271                                 _exit(EXIT_FAILURE);
3272                         }
3273
3274                         if (chdir("/") < 0) {
3275                                 log_error_errno(errno, "chdir() failed: %m");
3276                                 _exit(EXIT_FAILURE);
3277                         }
3278
3279                         umask(0022);
3280
3281                         if (arg_private_network)
3282                                 loopback_setup();
3283
3284                         if (drop_capabilities() < 0) {
3285                                 log_error_errno(errno, "drop_capabilities() failed: %m");
3286                                 _exit(EXIT_FAILURE);
3287                         }
3288
3289                         r = change_uid_gid(&home);
3290                         if (r < 0)
3291                                 _exit(EXIT_FAILURE);
3292
3293                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3294                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3295                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3296                                 log_oom();
3297                                 _exit(EXIT_FAILURE);
3298                         }
3299
3300                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3301                                 char as_uuid[37];
3302
3303                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3304                                         log_oom();
3305                                         _exit(EXIT_FAILURE);
3306                                 }
3307                         }
3308
3309                         if (fdset_size(fds) > 0) {
3310                                 r = fdset_cloexec(fds, false);
3311                                 if (r < 0) {
3312                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3313                                         _exit(EXIT_FAILURE);
3314                                 }
3315
3316                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3317                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3318                                         log_oom();
3319                                         _exit(EXIT_FAILURE);
3320                                 }
3321                         }
3322
3323                         setup_hostname();
3324
3325                         if (arg_personality != 0xffffffffLU) {
3326                                 if (personality(arg_personality) < 0) {
3327                                         log_error_errno(errno, "personality() failed: %m");
3328                                         _exit(EXIT_FAILURE);
3329                                 }
3330                         } else if (secondary) {
3331                                 if (personality(PER_LINUX32) < 0) {
3332                                         log_error_errno(errno, "personality() failed: %m");
3333                                         _exit(EXIT_FAILURE);
3334                                 }
3335                         }
3336
3337 #ifdef HAVE_SELINUX
3338                         if (arg_selinux_context)
3339                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3340                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3341                                         _exit(EXIT_FAILURE);
3342                                 }
3343 #endif
3344
3345                         if (!strv_isempty(arg_setenv)) {
3346                                 char **n;
3347
3348                                 n = strv_env_merge(2, envp, arg_setenv);
3349                                 if (!n) {
3350                                         log_oom();
3351                                         _exit(EXIT_FAILURE);
3352                                 }
3353
3354                                 env_use = n;
3355                         } else
3356                                 env_use = (char**) envp;
3357
3358                         /* Wait until the parent is ready with the setup, too... */
3359                         if (!barrier_place_and_sync(&barrier))
3360                                 _exit(EXIT_FAILURE);
3361
3362                         if (arg_boot) {
3363                                 char **a;
3364                                 size_t l;
3365
3366                                 /* Automatically search for the init system */
3367
3368                                 l = 1 + argc - optind;
3369                                 a = newa(char*, l + 1);
3370                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3371
3372                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3373                                 execve(a[0], a, env_use);
3374
3375                                 a[0] = (char*) "/lib/systemd/systemd";
3376                                 execve(a[0], a, env_use);
3377
3378                                 a[0] = (char*) "/sbin/init";
3379                                 execve(a[0], a, env_use);
3380                         } else if (argc > optind)
3381                                 execvpe(argv[optind], argv + optind, env_use);
3382                         else {
3383                                 chdir(home ? home : "/root");
3384                                 execle("/bin/bash", "-bash", NULL, env_use);
3385                                 execle("/bin/sh", "-sh", NULL, env_use);
3386                         }
3387
3388                         log_error_errno(errno, "execv() failed: %m");
3389                         _exit(EXIT_FAILURE);
3390                 }
3391
3392                 barrier_set_role(&barrier, BARRIER_PARENT);
3393                 fdset_free(fds);
3394                 fds = NULL;
3395
3396                 /* wait for child-setup to be done */
3397                 if (barrier_place_and_sync(&barrier)) {
3398                         _cleanup_event_unref_ sd_event *event = NULL;
3399                         _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3400                         int ifi = 0;
3401
3402                         r = move_network_interfaces(pid);
3403                         if (r < 0)
3404                                 goto finish;
3405
3406                         r = setup_veth(pid, veth_name, &ifi);
3407                         if (r < 0)
3408                                 goto finish;
3409
3410                         r = setup_bridge(veth_name, &ifi);
3411                         if (r < 0)
3412                                 goto finish;
3413
3414                         r = setup_macvlan(pid);
3415                         if (r < 0)
3416                                 goto finish;
3417
3418                         r = register_machine(pid, ifi);
3419                         if (r < 0)
3420                                 goto finish;
3421
3422                         /* Block SIGCHLD here, before notifying child.
3423                          * process_pty() will handle it with the other signals. */
3424                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3425                         if (r < 0)
3426                                 goto finish;
3427
3428                         /* Reset signal to default */
3429                         r = default_signals(SIGCHLD, -1);
3430                         if (r < 0)
3431                                 goto finish;
3432
3433                         /* Notify the child that the parent is ready with all
3434                          * its setup, and that the child can now hand over
3435                          * control to the code to run inside the container. */
3436                         (void)barrier_place(&barrier);
3437
3438                         r = sd_event_new(&event);
3439                         if (r < 0) {
3440                                 log_error_errno(r, "Failed to get default event source: %m");
3441                                 goto finish;
3442                         }
3443
3444                         if (arg_boot) {
3445                                 /* Try to kill the init system on SIGINT or SIGTERM */
3446                                 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3447                                 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3448                         } else {
3449                                 /* Immediately exit */
3450                                 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3451                                 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3452                         }
3453
3454                         /* simply exit on sigchld */
3455                         sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3456
3457                         r = pty_forward_new(event, master, &forward);
3458                         if (r < 0) {
3459                                 log_error_errno(r, "Failed to create PTY forwarder: %m");
3460                                 goto finish;
3461                         }
3462
3463                         r = sd_event_loop(event);
3464                         if (r < 0) {
3465                                 log_error_errno(r, "Failed to run event loop: %m");
3466                                 goto finish;
3467                         }
3468
3469                         forward = pty_forward_free(forward);
3470
3471                         if (!arg_quiet)
3472                                 putc('\n', stdout);
3473
3474                         /* Kill if it is not dead yet anyway */
3475                         terminate_machine(pid);
3476                 }
3477
3478                 /* Normally redundant, but better safe than sorry */
3479                 kill(pid, SIGKILL);
3480
3481                 r = wait_for_container(pid, &container_status);
3482                 pid = 0;
3483
3484                 if (r < 0)
3485                         /* We failed to wait for the container, or the
3486                          * container exited abnormally */
3487                         goto finish;
3488                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3489                         /* The container exited with a non-zero
3490                          * status, or with zero status and no reboot
3491                          * was requested. */
3492                         ret = r;
3493                         break;
3494                 }
3495
3496                 /* CONTAINER_REBOOTED, loop again */
3497
3498                 if (arg_keep_unit) {
3499                         /* Special handling if we are running as a
3500                          * service: instead of simply restarting the
3501                          * machine we want to restart the entire
3502                          * service, so let's inform systemd about this
3503                          * with the special exit code 133. The service
3504                          * file uses RestartForceExitStatus=133 so
3505                          * that this results in a full nspawn
3506                          * restart. This is necessary since we might
3507                          * have cgroup parameters set we want to have
3508                          * flushed out. */
3509                         ret = 133;
3510                         r = 0;
3511                         break;
3512                 }
3513         }
3514
3515 finish:
3516         sd_notify(false,
3517                   "STOPPING=1\n"
3518                   "STATUS=Terminating...");
3519
3520         loop_remove(loop_nr, &image_fd);
3521
3522         if (pid > 0)
3523                 kill(pid, SIGKILL);
3524
3525         if (remove_subvol && arg_directory) {
3526                 int k;
3527
3528                 k = btrfs_subvol_remove(arg_directory);
3529                 if (k < 0)
3530                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3531         }
3532
3533         free(arg_directory);
3534         free(arg_template);
3535         free(arg_image);
3536         free(arg_machine);
3537         free(arg_user);
3538         strv_free(arg_setenv);
3539         strv_free(arg_network_interfaces);
3540         strv_free(arg_network_macvlan);
3541         strv_free(arg_bind);
3542         strv_free(arg_bind_ro);
3543         strv_free(arg_tmpfs);
3544
3545         return r < 0 ? EXIT_FAILURE : ret;
3546 }