chiark / gitweb /
nspawn: mount most of the cgroup tree read-only in nspawn containers except for the...
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <getopt.h>
35 #include <termios.h>
36 #include <sys/signalfd.h>
37 #include <grp.h>
38 #include <linux/fs.h>
39 #include <sys/un.h>
40 #include <sys/socket.h>
41 #include <linux/netlink.h>
42 #include <net/if.h>
43 #include <linux/veth.h>
44 #include <sys/personality.h>
45 #include <linux/loop.h>
46
47 #ifdef HAVE_SELINUX
48 #include <selinux/selinux.h>
49 #endif
50
51 #ifdef HAVE_SECCOMP
52 #include <seccomp.h>
53 #endif
54
55 #ifdef HAVE_BLKID
56 #include <blkid/blkid.h>
57 #endif
58
59 #include "sd-daemon.h"
60 #include "sd-bus.h"
61 #include "sd-id128.h"
62 #include "sd-rtnl.h"
63 #include "log.h"
64 #include "util.h"
65 #include "mkdir.h"
66 #include "macro.h"
67 #include "audit.h"
68 #include "missing.h"
69 #include "cgroup-util.h"
70 #include "strv.h"
71 #include "path-util.h"
72 #include "loopback-setup.h"
73 #include "dev-setup.h"
74 #include "fdset.h"
75 #include "build.h"
76 #include "fileio.h"
77 #include "bus-util.h"
78 #include "bus-error.h"
79 #include "ptyfwd.h"
80 #include "bus-kernel.h"
81 #include "env-util.h"
82 #include "def.h"
83 #include "rtnl-util.h"
84 #include "udev-util.h"
85 #include "blkid-util.h"
86 #include "gpt.h"
87 #include "siphash24.h"
88 #include "copy.h"
89 #include "base-filesystem.h"
90 #include "barrier.h"
91 #include "event-util.h"
92 #include "capability.h"
93 #include "cap-list.h"
94 #include "btrfs-util.h"
95 #include "machine-image.h"
96
97 #ifdef HAVE_SECCOMP
98 #include "seccomp-util.h"
99 #endif
100
/* How a container run ended: it either terminated or asked for a reboot. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
105
/* Journal linking mode, selected with --link-journal= (or -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto; the initial setting */
        LINK_HOST  = 2,  /* --link-journal=host or try-host */
        LINK_GUEST = 3   /* --link-journal=guest or try-guest, and -j */
} LinkJournal;
112
/* Volatile mode, selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,  /* option absent, or a false boolean was given */
        VOLATILE_YES   = 1,  /* bare --volatile, or a true boolean was given */
        VOLATILE_STATE = 2,  /* --volatile=state */
} Volatile;
118
119 static char *arg_directory = NULL;
120 static char *arg_template = NULL;
121 static char *arg_user = NULL;
122 static sd_id128_t arg_uuid = {};
123 static char *arg_machine = NULL;
124 static const char *arg_selinux_context = NULL;
125 static const char *arg_selinux_apifs_context = NULL;
126 static const char *arg_slice = NULL;
127 static bool arg_private_network = false;
128 static bool arg_read_only = false;
129 static bool arg_boot = false;
130 static bool arg_ephemeral = false;
131 static LinkJournal arg_link_journal = LINK_AUTO;
132 static bool arg_link_journal_try = false;
133 static uint64_t arg_retain =
134         (1ULL << CAP_CHOWN) |
135         (1ULL << CAP_DAC_OVERRIDE) |
136         (1ULL << CAP_DAC_READ_SEARCH) |
137         (1ULL << CAP_FOWNER) |
138         (1ULL << CAP_FSETID) |
139         (1ULL << CAP_IPC_OWNER) |
140         (1ULL << CAP_KILL) |
141         (1ULL << CAP_LEASE) |
142         (1ULL << CAP_LINUX_IMMUTABLE) |
143         (1ULL << CAP_NET_BIND_SERVICE) |
144         (1ULL << CAP_NET_BROADCAST) |
145         (1ULL << CAP_NET_RAW) |
146         (1ULL << CAP_SETGID) |
147         (1ULL << CAP_SETFCAP) |
148         (1ULL << CAP_SETPCAP) |
149         (1ULL << CAP_SETUID) |
150         (1ULL << CAP_SYS_ADMIN) |
151         (1ULL << CAP_SYS_CHROOT) |
152         (1ULL << CAP_SYS_NICE) |
153         (1ULL << CAP_SYS_PTRACE) |
154         (1ULL << CAP_SYS_TTY_CONFIG) |
155         (1ULL << CAP_SYS_RESOURCE) |
156         (1ULL << CAP_SYS_BOOT) |
157         (1ULL << CAP_AUDIT_WRITE) |
158         (1ULL << CAP_AUDIT_CONTROL) |
159         (1ULL << CAP_MKNOD);
160 static char **arg_bind = NULL;
161 static char **arg_bind_ro = NULL;
162 static char **arg_tmpfs = NULL;
163 static char **arg_setenv = NULL;
164 static bool arg_quiet = false;
165 static bool arg_share_system = false;
166 static bool arg_register = true;
167 static bool arg_keep_unit = false;
168 static char **arg_network_interfaces = NULL;
169 static char **arg_network_macvlan = NULL;
170 static bool arg_network_veth = false;
171 static const char *arg_network_bridge = NULL;
172 static unsigned long arg_personality = 0xffffffffLU;
173 static char *arg_image = NULL;
174 static Volatile arg_volatile = VOLATILE_NO;
175
176 static void help(void) {
177         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
178                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
179                "  -h --help                 Show this help\n"
180                "     --version              Print version string\n"
181                "  -q --quiet                Do not show status information\n"
182                "  -D --directory=PATH       Root directory for the container\n"
183                "     --template=PATH        Initialize root directory from template directory,\n"
184                "                            if missing\n"
185                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
186                "                            remove it after exit\n"
187                "  -i --image=PATH           File system device or disk image for the container\n"
188                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
189                "  -u --user=USER            Run the command under specified user or uid\n"
190                "  -M --machine=NAME         Set the machine name for the container\n"
191                "     --uuid=UUID            Set a specific machine UUID for the container\n"
192                "  -S --slice=SLICE          Place the container in the specified slice\n"
193                "     --private-network      Disable network in container\n"
194                "     --network-interface=INTERFACE\n"
195                "                            Assign an existing network interface to the\n"
196                "                            container\n"
197                "     --network-macvlan=INTERFACE\n"
198                "                            Create a macvlan network interface based on an\n"
199                "                            existing network interface to the container\n"
200                "     --network-veth         Add a virtual ethernet connection between host\n"
201                "                            and container\n"
202                "     --network-bridge=INTERFACE\n"
203                "                            Add a virtual ethernet connection between host\n"
204                "                            and container and add it to an existing bridge on\n"
205                "                            the host\n"
206                "  -Z --selinux-context=SECLABEL\n"
207                "                            Set the SELinux security context to be used by\n"
208                "                            processes in the container\n"
209                "  -L --selinux-apifs-context=SECLABEL\n"
210                "                            Set the SELinux security context to be used by\n"
211                "                            API/tmpfs file systems in the container\n"
212                "     --capability=CAP       In addition to the default, retain specified\n"
213                "                            capability\n"
214                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
215                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
216                "                            try-guest, try-host\n"
217                "  -j                        Equivalent to --link-journal=try-guest\n"
218                "     --read-only            Mount the root directory read-only\n"
219                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
220                "                            the container\n"
221                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
222                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
223                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
224                "     --share-system         Share system namespaces with host\n"
225                "     --register=BOOLEAN     Register container as machine\n"
226                "     --keep-unit            Do not register a scope for the machine, reuse\n"
227                "                            the service unit nspawn is running in\n"
228                "     --volatile[=MODE]      Run the system in volatile mode\n",
229                program_invocation_short_name);
230 }
231
/* Replaces *b with a cleaned-up absolute version of path.
 *
 * The path is first resolved with canonicalize_file_name(). If that
 * fails with ENOENT, it is instead made absolute relative to the
 * current working directory. The previous value of *b is freed.
 *
 * Returns 0 on success, -ENOMEM if the fallback allocation fails, or
 * the negated errno of canonicalize_file_name() for any other failure. */
static int set_sanitized_path(char **b, const char *path) {
        char *resolved;

        assert(b);
        assert(path);

        resolved = canonicalize_file_name(path);
        if (!resolved && errno != ENOENT)
                return -errno;

        if (!resolved) {
                /* Path does not exist (yet): fall back to a purely
                 * lexical absolute path. */
                resolved = path_make_absolute_cwd(path);
                if (!resolved)
                        return -ENOMEM;
        }

        free(*b);
        *b = path_kill_slashes(resolved);

        return 0;
}
252
253 static int parse_argv(int argc, char *argv[]) {
254
255         enum {
256                 ARG_VERSION = 0x100,
257                 ARG_PRIVATE_NETWORK,
258                 ARG_UUID,
259                 ARG_READ_ONLY,
260                 ARG_CAPABILITY,
261                 ARG_DROP_CAPABILITY,
262                 ARG_LINK_JOURNAL,
263                 ARG_BIND,
264                 ARG_BIND_RO,
265                 ARG_TMPFS,
266                 ARG_SETENV,
267                 ARG_SHARE_SYSTEM,
268                 ARG_REGISTER,
269                 ARG_KEEP_UNIT,
270                 ARG_NETWORK_INTERFACE,
271                 ARG_NETWORK_MACVLAN,
272                 ARG_NETWORK_VETH,
273                 ARG_NETWORK_BRIDGE,
274                 ARG_PERSONALITY,
275                 ARG_VOLATILE,
276                 ARG_TEMPLATE,
277         };
278
279         static const struct option options[] = {
280                 { "help",                  no_argument,       NULL, 'h'                   },
281                 { "version",               no_argument,       NULL, ARG_VERSION           },
282                 { "directory",             required_argument, NULL, 'D'                   },
283                 { "template",              required_argument, NULL, ARG_TEMPLATE          },
284                 { "ephemeral",             no_argument,       NULL, 'x'                   },
285                 { "user",                  required_argument, NULL, 'u'                   },
286                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
287                 { "boot",                  no_argument,       NULL, 'b'                   },
288                 { "uuid",                  required_argument, NULL, ARG_UUID              },
289                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
290                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
291                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
292                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
293                 { "bind",                  required_argument, NULL, ARG_BIND              },
294                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
295                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
296                 { "machine",               required_argument, NULL, 'M'                   },
297                 { "slice",                 required_argument, NULL, 'S'                   },
298                 { "setenv",                required_argument, NULL, ARG_SETENV            },
299                 { "selinux-context",       required_argument, NULL, 'Z'                   },
300                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
301                 { "quiet",                 no_argument,       NULL, 'q'                   },
302                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
303                 { "register",              required_argument, NULL, ARG_REGISTER          },
304                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
305                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
306                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
307                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
308                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
309                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
310                 { "image",                 required_argument, NULL, 'i'                   },
311                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
312                 {}
313         };
314
315         int c, r;
316         uint64_t plus = 0, minus = 0;
317
318         assert(argc >= 0);
319         assert(argv);
320
321         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:x", options, NULL)) >= 0)
322
323                 switch (c) {
324
325                 case 'h':
326                         help();
327                         return 0;
328
329                 case ARG_VERSION:
330                         puts(PACKAGE_STRING);
331                         puts(SYSTEMD_FEATURES);
332                         return 0;
333
334                 case 'D':
335                         r = set_sanitized_path(&arg_directory, optarg);
336                         if (r < 0)
337                                 return log_error_errno(r, "Invalid root directory: %m");
338
339                         break;
340
341                 case ARG_TEMPLATE:
342                         r = set_sanitized_path(&arg_template, optarg);
343                         if (r < 0)
344                                 return log_error_errno(r, "Invalid template directory: %m");
345
346                         break;
347
348                 case 'i':
349                         r = set_sanitized_path(&arg_image, optarg);
350                         if (r < 0)
351                                 return log_error_errno(r, "Invalid image path: %m");
352
353                         break;
354
355                 case 'x':
356                         arg_ephemeral = true;
357                         break;
358
359                 case 'u':
360                         free(arg_user);
361                         arg_user = strdup(optarg);
362                         if (!arg_user)
363                                 return log_oom();
364
365                         break;
366
367                 case ARG_NETWORK_BRIDGE:
368                         arg_network_bridge = optarg;
369
370                         /* fall through */
371
372                 case ARG_NETWORK_VETH:
373                         arg_network_veth = true;
374                         arg_private_network = true;
375                         break;
376
377                 case ARG_NETWORK_INTERFACE:
378                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
379                                 return log_oom();
380
381                         arg_private_network = true;
382                         break;
383
384                 case ARG_NETWORK_MACVLAN:
385                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
386                                 return log_oom();
387
388                         /* fall through */
389
390                 case ARG_PRIVATE_NETWORK:
391                         arg_private_network = true;
392                         break;
393
394                 case 'b':
395                         arg_boot = true;
396                         break;
397
398                 case ARG_UUID:
399                         r = sd_id128_from_string(optarg, &arg_uuid);
400                         if (r < 0) {
401                                 log_error("Invalid UUID: %s", optarg);
402                                 return r;
403                         }
404                         break;
405
406                 case 'S':
407                         arg_slice = optarg;
408                         break;
409
410                 case 'M':
411                         if (isempty(optarg)) {
412                                 free(arg_machine);
413                                 arg_machine = NULL;
414                         } else {
415                                 if (!machine_name_is_valid(optarg)) {
416                                         log_error("Invalid machine name: %s", optarg);
417                                         return -EINVAL;
418                                 }
419
420                                 r = free_and_strdup(&arg_machine, optarg);
421                                 if (r < 0)
422                                         return log_oom();
423
424                                 break;
425                         }
426
427                 case 'Z':
428                         arg_selinux_context = optarg;
429                         break;
430
431                 case 'L':
432                         arg_selinux_apifs_context = optarg;
433                         break;
434
435                 case ARG_READ_ONLY:
436                         arg_read_only = true;
437                         break;
438
439                 case ARG_CAPABILITY:
440                 case ARG_DROP_CAPABILITY: {
441                         const char *state, *word;
442                         size_t length;
443
444                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
445                                 _cleanup_free_ char *t;
446
447                                 t = strndup(word, length);
448                                 if (!t)
449                                         return log_oom();
450
451                                 if (streq(t, "all")) {
452                                         if (c == ARG_CAPABILITY)
453                                                 plus = (uint64_t) -1;
454                                         else
455                                                 minus = (uint64_t) -1;
456                                 } else {
457                                         int cap;
458
459                                         cap = capability_from_name(t);
460                                         if (cap < 0) {
461                                                 log_error("Failed to parse capability %s.", t);
462                                                 return -EINVAL;
463                                         }
464
465                                         if (c == ARG_CAPABILITY)
466                                                 plus |= 1ULL << (uint64_t) cap;
467                                         else
468                                                 minus |= 1ULL << (uint64_t) cap;
469                                 }
470                         }
471
472                         break;
473                 }
474
475                 case 'j':
476                         arg_link_journal = LINK_GUEST;
477                         arg_link_journal_try = true;
478                         break;
479
480                 case ARG_LINK_JOURNAL:
481                         if (streq(optarg, "auto")) {
482                                 arg_link_journal = LINK_AUTO;
483                                 arg_link_journal_try = false;
484                         } else if (streq(optarg, "no")) {
485                                 arg_link_journal = LINK_NO;
486                                 arg_link_journal_try = false;
487                         } else if (streq(optarg, "guest")) {
488                                 arg_link_journal = LINK_GUEST;
489                                 arg_link_journal_try = false;
490                         } else if (streq(optarg, "host")) {
491                                 arg_link_journal = LINK_HOST;
492                                 arg_link_journal_try = false;
493                         } else if (streq(optarg, "try-guest")) {
494                                 arg_link_journal = LINK_GUEST;
495                                 arg_link_journal_try = true;
496                         } else if (streq(optarg, "try-host")) {
497                                 arg_link_journal = LINK_HOST;
498                                 arg_link_journal_try = true;
499                         } else {
500                                 log_error("Failed to parse link journal mode %s", optarg);
501                                 return -EINVAL;
502                         }
503
504                         break;
505
506                 case ARG_BIND:
507                 case ARG_BIND_RO: {
508                         _cleanup_free_ char *a = NULL, *b = NULL;
509                         char *e;
510                         char ***x;
511
512                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
513
514                         e = strchr(optarg, ':');
515                         if (e) {
516                                 a = strndup(optarg, e - optarg);
517                                 b = strdup(e + 1);
518                         } else {
519                                 a = strdup(optarg);
520                                 b = strdup(optarg);
521                         }
522
523                         if (!a || !b)
524                                 return log_oom();
525
526                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
527                                 log_error("Invalid bind mount specification: %s", optarg);
528                                 return -EINVAL;
529                         }
530
531                         r = strv_extend(x, a);
532                         if (r < 0)
533                                 return log_oom();
534
535                         r = strv_extend(x, b);
536                         if (r < 0)
537                                 return log_oom();
538
539                         break;
540                 }
541
542                 case ARG_TMPFS: {
543                         _cleanup_free_ char *a = NULL, *b = NULL;
544                         char *e;
545
546                         e = strchr(optarg, ':');
547                         if (e) {
548                                 a = strndup(optarg, e - optarg);
549                                 b = strdup(e + 1);
550                         } else {
551                                 a = strdup(optarg);
552                                 b = strdup("mode=0755");
553                         }
554
555                         if (!a || !b)
556                                 return log_oom();
557
558                         if (!path_is_absolute(a)) {
559                                 log_error("Invalid tmpfs specification: %s", optarg);
560                                 return -EINVAL;
561                         }
562
563                         r = strv_push(&arg_tmpfs, a);
564                         if (r < 0)
565                                 return log_oom();
566
567                         a = NULL;
568
569                         r = strv_push(&arg_tmpfs, b);
570                         if (r < 0)
571                                 return log_oom();
572
573                         b = NULL;
574
575                         break;
576                 }
577
578                 case ARG_SETENV: {
579                         char **n;
580
581                         if (!env_assignment_is_valid(optarg)) {
582                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
583                                 return -EINVAL;
584                         }
585
586                         n = strv_env_set(arg_setenv, optarg);
587                         if (!n)
588                                 return log_oom();
589
590                         strv_free(arg_setenv);
591                         arg_setenv = n;
592                         break;
593                 }
594
595                 case 'q':
596                         arg_quiet = true;
597                         break;
598
599                 case ARG_SHARE_SYSTEM:
600                         arg_share_system = true;
601                         break;
602
603                 case ARG_REGISTER:
604                         r = parse_boolean(optarg);
605                         if (r < 0) {
606                                 log_error("Failed to parse --register= argument: %s", optarg);
607                                 return r;
608                         }
609
610                         arg_register = r;
611                         break;
612
613                 case ARG_KEEP_UNIT:
614                         arg_keep_unit = true;
615                         break;
616
617                 case ARG_PERSONALITY:
618
619                         arg_personality = personality_from_string(optarg);
620                         if (arg_personality == 0xffffffffLU) {
621                                 log_error("Unknown or unsupported personality '%s'.", optarg);
622                                 return -EINVAL;
623                         }
624
625                         break;
626
627                 case ARG_VOLATILE:
628
629                         if (!optarg)
630                                 arg_volatile = VOLATILE_YES;
631                         else {
632                                 r = parse_boolean(optarg);
633                                 if (r < 0) {
634                                         if (streq(optarg, "state"))
635                                                 arg_volatile = VOLATILE_STATE;
636                                         else {
637                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
638                                                 return r;
639                                         }
640                                 } else
641                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
642                         }
643
644                         break;
645
646                 case '?':
647                         return -EINVAL;
648
649                 default:
650                         assert_not_reached("Unhandled option");
651                 }
652
653         if (arg_share_system)
654                 arg_register = false;
655
656         if (arg_boot && arg_share_system) {
657                 log_error("--boot and --share-system may not be combined.");
658                 return -EINVAL;
659         }
660
661         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
662                 log_error("--keep-unit may not be used when invoked from a user session.");
663                 return -EINVAL;
664         }
665
666         if (arg_directory && arg_image) {
667                 log_error("--directory= and --image= may not be combined.");
668                 return -EINVAL;
669         }
670
671         if (arg_template && arg_image) {
672                 log_error("--template= and --image= may not be combined.");
673                 return -EINVAL;
674         }
675
676         if (arg_template && !(arg_directory || arg_machine)) {
677                 log_error("--template= needs --directory= or --machine=.");
678                 return -EINVAL;
679         }
680
681         if (arg_ephemeral && arg_template) {
682                 log_error("--ephemeral and --template= may not be combined.");
683                 return -EINVAL;
684         }
685
686         if (arg_ephemeral && arg_image) {
687                 log_error("--ephemeral and --image= may not be combined.");
688                 return -EINVAL;
689         }
690
691         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO)) {
692                 log_error("--ephemeral and --link-journal= may not be combined.");
693                 return -EINVAL;
694         }
695
696         if (arg_volatile != VOLATILE_NO && arg_read_only) {
697                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
698                 return -EINVAL;
699         }
700
701         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
702
703         return 1;
704 }
705
706 static int mount_all(const char *dest) {
707
708         typedef struct MountPoint {
709                 const char *what;
710                 const char *where;
711                 const char *type;
712                 const char *options;
713                 unsigned long flags;
714                 bool fatal;
715         } MountPoint;
716
717         static const MountPoint mount_table[] = {
718                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
719                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
720                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
721                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
722                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
723                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
724                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
725                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
726 #ifdef HAVE_SELINUX
727                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
728                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
729 #endif
730         };
731
732         unsigned k;
733         int r = 0;
734
735         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
736                 _cleanup_free_ char *where = NULL;
737 #ifdef HAVE_SELINUX
738                 _cleanup_free_ char *options = NULL;
739 #endif
740                 const char *o;
741                 int t;
742
743                 where = strjoin(dest, "/", mount_table[k].where, NULL);
744                 if (!where)
745                         return log_oom();
746
747                 t = path_is_mount_point(where, true);
748                 if (t < 0) {
749                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
750
751                         if (r == 0)
752                                 r = t;
753
754                         continue;
755                 }
756
757                 /* Skip this entry if it is not a remount. */
758                 if (mount_table[k].what && t > 0)
759                         continue;
760
761                 t = mkdir_p(where, 0755);
762                 if (t < 0) {
763                         if (mount_table[k].fatal) {
764                                log_error_errno(t, "Failed to create directory %s: %m", where);
765
766                                 if (r == 0)
767                                         r = t;
768                         } else
769                                log_warning_errno(t, "Failed to create directory %s: %m", where);
770
771                         continue;
772                 }
773
774 #ifdef HAVE_SELINUX
775                 if (arg_selinux_apifs_context &&
776                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
777                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
778                         if (!options)
779                                 return log_oom();
780
781                         o = options;
782                 } else
783 #endif
784                         o = mount_table[k].options;
785
786
787                 if (mount(mount_table[k].what,
788                           where,
789                           mount_table[k].type,
790                           mount_table[k].flags,
791                           o) < 0) {
792
793                         if (mount_table[k].fatal) {
794                                 log_error_errno(errno, "mount(%s) failed: %m", where);
795
796                                 if (r == 0)
797                                         r = -errno;
798                         } else
799                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
800                 }
801         }
802
803         return r;
804 }
805
806 static int mount_binds(const char *dest, char **l, bool ro) {
807         char **x, **y;
808
809         STRV_FOREACH_PAIR(x, y, l) {
810                 _cleanup_free_ char *where = NULL;
811                 struct stat source_st, dest_st;
812                 int r;
813
814                 if (stat(*x, &source_st) < 0)
815                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
816
817                 where = strappend(dest, *y);
818                 if (!where)
819                         return log_oom();
820
821                 r = stat(where, &dest_st);
822                 if (r == 0) {
823                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
824                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
825                                 return -EINVAL;
826                         }
827                 } else if (errno == ENOENT) {
828                         r = mkdir_parents_label(where, 0755);
829                         if (r < 0)
830                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
831                 } else {
832                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
833                         return -errno;
834                 }
835
836                 /* Create the mount point, but be conservative -- refuse to create block
837                  * and char devices. */
838                 if (S_ISDIR(source_st.st_mode)) {
839                         r = mkdir_label(where, 0755);
840                         if (r < 0 && errno != EEXIST)
841                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
842                 } else if (S_ISFIFO(source_st.st_mode)) {
843                         r = mkfifo(where, 0644);
844                         if (r < 0 && errno != EEXIST)
845                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
846                 } else if (S_ISSOCK(source_st.st_mode)) {
847                         r = mknod(where, 0644 | S_IFSOCK, 0);
848                         if (r < 0 && errno != EEXIST)
849                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
850                 } else if (S_ISREG(source_st.st_mode)) {
851                         r = touch(where);
852                         if (r < 0)
853                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
854                 } else {
855                         log_error("Refusing to create mountpoint for file: %s", *x);
856                         return -ENOTSUP;
857                 }
858
859                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
860                         return log_error_errno(errno, "mount(%s) failed: %m", where);
861
862                 if (ro) {
863                         r = bind_remount_recursive(where, true);
864                         if (r < 0)
865                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
866                 }
867         }
868
869         return 0;
870 }
871
/* Mounts the cgroup hierarchy selected by the mount option string
 * "controller" to <dest>/sys/fs/cgroup/<hierarchy>, read-only if requested.
 *
 * Returns 1 if the hierarchy was newly mounted, 0 if the target already is
 * a mount point, and a negative errno-style value on failure. */
static int mount_cgroup_hierarchy(const char *dest, const char *controller, const char *hierarchy, bool read_only) {
        unsigned long flags = MS_NOSUID|MS_NOEXEC|MS_NODEV;
        char *target;
        int mounted;

        target = strappenda(dest, "/sys/fs/cgroup/", hierarchy);

        mounted = path_is_mount_point(target, false);
        if (mounted < 0)
                return log_error_errno(mounted, "Failed to determine if %s is mounted already: %m", target);
        if (mounted > 0)
                return 0;

        /* The result is not checked here; the mount() call below is the
         * one whose failure gets reported. */
        mkdir_p(target, 0755);

        if (read_only)
                flags |= MS_RDONLY;

        if (mount("cgroup", target, "cgroup", flags, controller) < 0)
                return log_error_errno(errno, "Failed to mount to %s: %m", target);

        return 1;
}
891
892 static int mount_cgroup(const char *dest) {
893         _cleanup_set_free_free_ Set *controllers = NULL;
894         _cleanup_free_ char *own_cgroup_path = NULL;
895         const char *cgroup_root, *systemd_root, *systemd_own;
896         int r;
897
898         controllers = set_new(&string_hash_ops);
899         if (!controllers)
900                 return log_oom();
901
902         r = cg_kernel_controllers(controllers);
903         if (r < 0)
904                 return log_error_errno(r, "Failed to determine cgroup controllers: %m");
905
906         r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
907         if (r < 0)
908                 return log_error_errno(r, "Failed to determine our own cgroup path: %m");
909
910         cgroup_root = strappenda(dest, "/sys/fs/cgroup");
911         if (mount("tmpfs", cgroup_root, "tmpfs", MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, "mode=755") < 0)
912                 return log_error_errno(errno, "Failed to mount tmpfs to /sys/fs/cgroup: %m");
913
914         for (;;) {
915                 _cleanup_free_ char *controller = NULL, *origin = NULL, *combined = NULL;
916
917                 controller = set_steal_first(controllers);
918                 if (!controller)
919                         break;
920
921                 origin = strappend("/sys/fs/cgroup/", controller);
922                 if (!origin)
923                         return log_oom();
924
925                 r = readlink_malloc(origin, &combined);
926                 if (r == -EINVAL) {
927                         /* Not a symbolic link, but directly a single cgroup hierarchy */
928
929                         r = mount_cgroup_hierarchy(dest, controller, controller, true);
930                         if (r < 0)
931                                 return r;
932
933                 } else if (r < 0)
934                         return log_error_errno(r, "Failed to read link %s: %m", origin);
935                 else {
936                         _cleanup_free_ char *target = NULL;
937
938                         target = strjoin(dest, "/sys/fs/cgroup/", controller, NULL);
939                         if (!target)
940                                 return log_oom();
941
942                         /* A symbolic link, a combination of controllers in one hierarchy */
943
944                         if (!filename_is_valid(combined)) {
945                                 log_warning("Ignoring invalid combined hierarchy %s.", combined);
946                                 continue;
947                         }
948
949                         r = mount_cgroup_hierarchy(dest, combined, combined, true);
950                         if (r < 0)
951                                 return r;
952
953                         if (symlink(combined, target) < 0)
954                                 return log_error_errno(errno, "Failed to create symlink for combined hiearchy: %m");
955                 }
956         }
957
958         r = mount_cgroup_hierarchy(dest, "name=systemd", "systemd", false);
959         if (r < 0)
960                 return r;
961
962         /* Make our own cgroup a (writable) bind mount */
963         systemd_own = strappenda(dest, "/sys/fs/cgroup/systemd", own_cgroup_path);
964         if (mount(systemd_own, systemd_own,  NULL, MS_BIND, NULL) < 0)
965                 return log_error_errno(errno, "Failed to turn %s into a bind mount: %m", own_cgroup_path);
966
967         /* And then remount the systemd cgroup root read-only */
968         systemd_root = strappenda(dest, "/sys/fs/cgroup/systemd");
969         if (mount(NULL, systemd_root, NULL, MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL) < 0)
970                 return log_error_errno(errno, "Failed to mount cgroup root read-only: %m");
971
972         if (mount(NULL, cgroup_root, NULL, MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755") < 0)
973                 return log_error_errno(errno, "Failed to remount %s read-only: %m", cgroup_root);
974
975         return 0;
976 }
977
978 static int mount_tmpfs(const char *dest) {
979         char **i, **o;
980
981         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
982                 _cleanup_free_ char *where = NULL;
983                 int r;
984
985                 where = strappend(dest, *i);
986                 if (!where)
987                         return log_oom();
988
989                 r = mkdir_label(where, 0755);
990                 if (r < 0 && r != -EEXIST)
991                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
992
993                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
994                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
995         }
996
997         return 0;
998 }
999
1000 static int setup_timezone(const char *dest) {
1001         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
1002         char *z, *y;
1003         int r;
1004
1005         assert(dest);
1006
1007         /* Fix the timezone, if possible */
1008         r = readlink_malloc("/etc/localtime", &p);
1009         if (r < 0) {
1010                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
1011                 return 0;
1012         }
1013
1014         z = path_startswith(p, "../usr/share/zoneinfo/");
1015         if (!z)
1016                 z = path_startswith(p, "/usr/share/zoneinfo/");
1017         if (!z) {
1018                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
1019                 return 0;
1020         }
1021
1022         where = strappend(dest, "/etc/localtime");
1023         if (!where)
1024                 return log_oom();
1025
1026         r = readlink_malloc(where, &q);
1027         if (r >= 0) {
1028                 y = path_startswith(q, "../usr/share/zoneinfo/");
1029                 if (!y)
1030                         y = path_startswith(q, "/usr/share/zoneinfo/");
1031
1032                 /* Already pointing to the right place? Then do nothing .. */
1033                 if (y && streq(y, z))
1034                         return 0;
1035         }
1036
1037         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
1038         if (!check)
1039                 return log_oom();
1040
1041         if (access(check, F_OK) < 0) {
1042                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
1043                 return 0;
1044         }
1045
1046         what = strappend("../usr/share/zoneinfo/", z);
1047         if (!what)
1048                 return log_oom();
1049
1050         r = mkdir_parents(where, 0755);
1051         if (r < 0) {
1052                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
1053
1054                 return 0;
1055         }
1056
1057         r = unlink(where);
1058         if (r < 0 && errno != ENOENT) {
1059                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
1060
1061                 return 0;
1062         }
1063
1064         if (symlink(what, where) < 0) {
1065                 log_error_errno(errno, "Failed to correct timezone of container: %m");
1066                 return 0;
1067         }
1068
1069         return 0;
1070 }
1071
1072 static int setup_resolv_conf(const char *dest) {
1073         _cleanup_free_ char *where = NULL;
1074         int r;
1075
1076         assert(dest);
1077
1078         if (arg_private_network)
1079                 return 0;
1080
1081         /* Fix resolv.conf, if possible */
1082         where = strappend(dest, "/etc/resolv.conf");
1083         if (!where)
1084                 return log_oom();
1085
1086         /* We don't really care for the results of this really. If it
1087          * fails, it fails, but meh... */
1088         r = mkdir_parents(where, 0755);
1089         if (r < 0) {
1090                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
1091
1092                 return 0;
1093         }
1094
1095         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
1096         if (r < 0) {
1097                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
1098
1099                 return 0;
1100         }
1101
1102         return 0;
1103 }
1104
1105 static int setup_volatile_state(const char *directory) {
1106         const char *p;
1107         int r;
1108
1109         assert(directory);
1110
1111         if (arg_volatile != VOLATILE_STATE)
1112                 return 0;
1113
1114         /* --volatile=state means we simply overmount /var
1115            with a tmpfs, and the rest read-only. */
1116
1117         r = bind_remount_recursive(directory, true);
1118         if (r < 0)
1119                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
1120
1121         p = strappenda(directory, "/var");
1122         r = mkdir(p, 0755);
1123         if (r < 0 && errno != EEXIST)
1124                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1125
1126         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
1127                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
1128
1129         return 0;
1130 }
1131
1132 static int setup_volatile(const char *directory) {
1133         bool tmpfs_mounted = false, bind_mounted = false;
1134         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1135         const char *f, *t;
1136         int r;
1137
1138         assert(directory);
1139
1140         if (arg_volatile != VOLATILE_YES)
1141                 return 0;
1142
1143         /* --volatile=yes means we mount a tmpfs to the root dir, and
1144            the original /usr to use inside it, and that read-only. */
1145
1146         if (!mkdtemp(template))
1147                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1148
1149         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
1150                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
1151                 r = -errno;
1152                 goto fail;
1153         }
1154
1155         tmpfs_mounted = true;
1156
1157         f = strappenda(directory, "/usr");
1158         t = strappenda(template, "/usr");
1159
1160         r = mkdir(t, 0755);
1161         if (r < 0 && errno != EEXIST) {
1162                 log_error_errno(errno, "Failed to create %s: %m", t);
1163                 r = -errno;
1164                 goto fail;
1165         }
1166
1167         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1168                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
1169                 r = -errno;
1170                 goto fail;
1171         }
1172
1173         bind_mounted = true;
1174
1175         r = bind_remount_recursive(t, true);
1176         if (r < 0) {
1177                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1178                 goto fail;
1179         }
1180
1181         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1182                 log_error_errno(errno, "Failed to move root mount: %m");
1183                 r = -errno;
1184                 goto fail;
1185         }
1186
1187         rmdir(template);
1188
1189         return 0;
1190
1191 fail:
1192         if (bind_mounted)
1193                 umount(t);
1194         if (tmpfs_mounted)
1195                 umount(template);
1196         rmdir(template);
1197         return r;
1198 }
1199
1200 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1201
1202         snprintf(s, 37,
1203                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1204                  SD_ID128_FORMAT_VAL(id));
1205
1206         return s;
1207 }
1208
1209 static int setup_boot_id(const char *dest) {
1210         _cleanup_free_ char *from = NULL, *to = NULL;
1211         sd_id128_t rnd = {};
1212         char as_uuid[37];
1213         int r;
1214
1215         assert(dest);
1216
1217         if (arg_share_system)
1218                 return 0;
1219
1220         /* Generate a new randomized boot ID, so that each boot-up of
1221          * the container gets a new one */
1222
1223         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1224         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1225         if (!from || !to)
1226                 return log_oom();
1227
1228         r = sd_id128_randomize(&rnd);
1229         if (r < 0)
1230                 return log_error_errno(r, "Failed to generate random boot id: %m");
1231
1232         id128_format_as_uuid(rnd, as_uuid);
1233
1234         r = write_string_file(from, as_uuid);
1235         if (r < 0)
1236                 return log_error_errno(r, "Failed to write boot id: %m");
1237
1238         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1239                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1240                 r = -errno;
1241         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1242                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1243
1244         unlink(from);
1245         return r;
1246 }
1247
1248 static int copy_devnodes(const char *dest) {
1249
1250         static const char devnodes[] =
1251                 "null\0"
1252                 "zero\0"
1253                 "full\0"
1254                 "random\0"
1255                 "urandom\0"
1256                 "tty\0"
1257                 "net/tun\0";
1258
1259         const char *d;
1260         int r = 0;
1261         _cleanup_umask_ mode_t u;
1262
1263         assert(dest);
1264
1265         u = umask(0000);
1266
1267         NULSTR_FOREACH(d, devnodes) {
1268                 _cleanup_free_ char *from = NULL, *to = NULL;
1269                 struct stat st;
1270
1271                 from = strappend("/dev/", d);
1272                 to = strjoin(dest, "/dev/", d, NULL);
1273                 if (!from || !to)
1274                         return log_oom();
1275
1276                 if (stat(from, &st) < 0) {
1277
1278                         if (errno != ENOENT)
1279                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1280
1281                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1282
1283                         log_error("%s is not a char or block device, cannot copy", from);
1284                         return -EIO;
1285
1286                 } else {
1287                         r = mkdir_parents(to, 0775);
1288                         if (r < 0) {
1289                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1290                                 return -r;
1291                         }
1292
1293                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1294                                 return log_error_errno(errno, "mknod(%s) failed: %m", dest);
1295                 }
1296         }
1297
1298         return r;
1299 }
1300
1301 static int setup_ptmx(const char *dest) {
1302         _cleanup_free_ char *p = NULL;
1303
1304         p = strappend(dest, "/dev/ptmx");
1305         if (!p)
1306                 return log_oom();
1307
1308         if (symlink("pts/ptmx", p) < 0)
1309                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1310
1311         return 0;
1312 }
1313
1314 static int setup_dev_console(const char *dest, const char *console) {
1315         _cleanup_umask_ mode_t u;
1316         const char *to;
1317         struct stat st;
1318         int r;
1319
1320         assert(dest);
1321         assert(console);
1322
1323         u = umask(0000);
1324
1325         if (stat("/dev/null", &st) < 0)
1326                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1327
1328         r = chmod_and_chown(console, 0600, 0, 0);
1329         if (r < 0)
1330                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1331
1332         /* We need to bind mount the right tty to /dev/console since
1333          * ptys can only exist on pts file systems. To have something
1334          * to bind mount things on we create a device node first, and
1335          * use /dev/null for that since we the cgroups device policy
1336          * allows us to create that freely, while we cannot create
1337          * /dev/console. (Note that the major minor doesn't actually
1338          * matter here, since we mount it over anyway). */
1339
1340         to = strappenda(dest, "/dev/console");
1341         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1342                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1343
1344         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1345                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1346
1347         return 0;
1348 }
1349
1350 static int setup_kmsg(const char *dest, int kmsg_socket) {
1351         _cleanup_free_ char *from = NULL, *to = NULL;
1352         int r, fd, k;
1353         _cleanup_umask_ mode_t u;
1354         union {
1355                 struct cmsghdr cmsghdr;
1356                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1357         } control = {};
1358         struct msghdr mh = {
1359                 .msg_control = &control,
1360                 .msg_controllen = sizeof(control),
1361         };
1362         struct cmsghdr *cmsg;
1363
1364         assert(dest);
1365         assert(kmsg_socket >= 0);
1366
1367         u = umask(0000);
1368
1369         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1370          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1371          * on the reading side behave very similar to /proc/kmsg,
1372          * their writing side behaves differently from /dev/kmsg in
1373          * that writing blocks when nothing is reading. In order to
1374          * avoid any problems with containers deadlocking due to this
1375          * we simply make /dev/kmsg unavailable to the container. */
1376         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1377             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1378                 return log_oom();
1379
1380         if (mkfifo(from, 0600) < 0)
1381                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1382
1383         r = chmod_and_chown(from, 0600, 0, 0);
1384         if (r < 0)
1385                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1386
1387         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1388                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1389
1390         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1391         if (fd < 0)
1392                 return log_error_errno(errno, "Failed to open fifo: %m");
1393
1394         cmsg = CMSG_FIRSTHDR(&mh);
1395         cmsg->cmsg_level = SOL_SOCKET;
1396         cmsg->cmsg_type = SCM_RIGHTS;
1397         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1398         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1399
1400         mh.msg_controllen = cmsg->cmsg_len;
1401
1402         /* Store away the fd in the socket, so that it stays open as
1403          * long as we run the child */
1404         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1405         safe_close(fd);
1406
1407         if (k < 0)
1408                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1409
1410         /* And now make the FIFO unavailable as /dev/kmsg... */
1411         unlink(from);
1412         return 0;
1413 }
1414
1415 static int setup_hostname(void) {
1416
1417         if (arg_share_system)
1418                 return 0;
1419
1420         if (sethostname_idempotent(arg_machine) < 0)
1421                 return -errno;
1422
1423         return 0;
1424 }
1425
1426 static int setup_journal(const char *directory) {
1427         sd_id128_t machine_id, this_id;
1428         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1429         char *id;
1430         int r;
1431
1432         /* Don't link journals in ephemeral mode */
1433         if (arg_ephemeral)
1434                 return 0;
1435
1436         p = strappend(directory, "/etc/machine-id");
1437         if (!p)
1438                 return log_oom();
1439
1440         r = read_one_line_file(p, &b);
1441         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1442                 return 0;
1443         else if (r < 0)
1444                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1445
1446         id = strstrip(b);
1447         if (isempty(id) && arg_link_journal == LINK_AUTO)
1448                 return 0;
1449
1450         /* Verify validity */
1451         r = sd_id128_from_string(id, &machine_id);
1452         if (r < 0)
1453                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1454
1455         r = sd_id128_get_machine(&this_id);
1456         if (r < 0)
1457                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1458
1459         if (sd_id128_equal(machine_id, this_id)) {
1460                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1461                          "Host and machine ids are equal (%s): refusing to link journals", id);
1462                 if (arg_link_journal == LINK_AUTO)
1463                         return 0;
1464                 return -EEXIST;
1465         }
1466
1467         if (arg_link_journal == LINK_NO)
1468                 return 0;
1469
1470         free(p);
1471         p = strappend("/var/log/journal/", id);
1472         q = strjoin(directory, "/var/log/journal/", id, NULL);
1473         if (!p || !q)
1474                 return log_oom();
1475
1476         if (path_is_mount_point(p, false) > 0) {
1477                 if (arg_link_journal != LINK_AUTO) {
1478                         log_error("%s: already a mount point, refusing to use for journal", p);
1479                         return -EEXIST;
1480                 }
1481
1482                 return 0;
1483         }
1484
1485         if (path_is_mount_point(q, false) > 0) {
1486                 if (arg_link_journal != LINK_AUTO) {
1487                         log_error("%s: already a mount point, refusing to use for journal", q);
1488                         return -EEXIST;
1489                 }
1490
1491                 return 0;
1492         }
1493
1494         r = readlink_and_make_absolute(p, &d);
1495         if (r >= 0) {
1496                 if ((arg_link_journal == LINK_GUEST ||
1497                      arg_link_journal == LINK_AUTO) &&
1498                     path_equal(d, q)) {
1499
1500                         r = mkdir_p(q, 0755);
1501                         if (r < 0)
1502                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1503                         return 0;
1504                 }
1505
1506                 if (unlink(p) < 0)
1507                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1508         } else if (r == -EINVAL) {
1509
1510                 if (arg_link_journal == LINK_GUEST &&
1511                     rmdir(p) < 0) {
1512
1513                         if (errno == ENOTDIR) {
1514                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1515                                 return r;
1516                         } else {
1517                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1518                                 return -errno;
1519                         }
1520                 }
1521         } else if (r != -ENOENT) {
1522                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1523                 return r;
1524         }
1525
1526         if (arg_link_journal == LINK_GUEST) {
1527
1528                 if (symlink(q, p) < 0) {
1529                         if (arg_link_journal_try) {
1530                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1531                                 return 0;
1532                         } else {
1533                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1534                                 return -errno;
1535                         }
1536                 }
1537
1538                 r = mkdir_p(q, 0755);
1539                 if (r < 0)
1540                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1541                 return 0;
1542         }
1543
1544         if (arg_link_journal == LINK_HOST) {
1545                 /* don't create parents here -- if the host doesn't have
1546                  * permanent journal set up, don't force it here */
1547                 r = mkdir(p, 0755);
1548                 if (r < 0) {
1549                         if (arg_link_journal_try) {
1550                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1551                                 return 0;
1552                         } else {
1553                                 log_error_errno(errno, "Failed to create %s: %m", p);
1554                                 return r;
1555                         }
1556                 }
1557
1558         } else if (access(p, F_OK) < 0)
1559                 return 0;
1560
1561         if (dir_is_empty(q) == 0)
1562                 log_warning("%s is not empty, proceeding anyway.", q);
1563
1564         r = mkdir_p(q, 0755);
1565         if (r < 0) {
1566                 log_error_errno(errno, "Failed to create %s: %m", q);
1567                 return r;
1568         }
1569
1570         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1571                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1572
1573         return 0;
1574 }
1575
1576 static int drop_capabilities(void) {
1577         return capability_bounding_set_drop(~arg_retain, false);
1578 }
1579
1580 static int register_machine(pid_t pid, int local_ifindex) {
1581         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1582         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1583         int r;
1584
1585         if (!arg_register)
1586                 return 0;
1587
1588         r = sd_bus_default_system(&bus);
1589         if (r < 0)
1590                 return log_error_errno(r, "Failed to open system bus: %m");
1591
1592         if (arg_keep_unit) {
1593                 r = sd_bus_call_method(
1594                                 bus,
1595                                 "org.freedesktop.machine1",
1596                                 "/org/freedesktop/machine1",
1597                                 "org.freedesktop.machine1.Manager",
1598                                 "RegisterMachineWithNetwork",
1599                                 &error,
1600                                 NULL,
1601                                 "sayssusai",
1602                                 arg_machine,
1603                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1604                                 "nspawn",
1605                                 "container",
1606                                 (uint32_t) pid,
1607                                 strempty(arg_directory),
1608                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1609         } else {
1610                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1611
1612                 r = sd_bus_message_new_method_call(
1613                                 bus,
1614                                 &m,
1615                                 "org.freedesktop.machine1",
1616                                 "/org/freedesktop/machine1",
1617                                 "org.freedesktop.machine1.Manager",
1618                                 "CreateMachineWithNetwork");
1619                 if (r < 0)
1620                         return log_error_errno(r, "Failed to create message: %m");
1621
1622                 r = sd_bus_message_append(
1623                                 m,
1624                                 "sayssusai",
1625                                 arg_machine,
1626                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1627                                 "nspawn",
1628                                 "container",
1629                                 (uint32_t) pid,
1630                                 strempty(arg_directory),
1631                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1632                 if (r < 0)
1633                         return log_error_errno(r, "Failed to append message arguments: %m");
1634
1635                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1636                 if (r < 0)
1637                         return log_error_errno(r, "Failed to open container: %m");
1638
1639                 if (!isempty(arg_slice)) {
1640                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1641                         if (r < 0)
1642                                 return log_error_errno(r, "Failed to append slice: %m");
1643                 }
1644
1645                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1646                 if (r < 0)
1647                         return log_error_errno(r, "Failed to add device policy: %m");
1648
1649                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1650                                           /* Allow the container to
1651                                            * access and create the API
1652                                            * device nodes, so that
1653                                            * PrivateDevices= in the
1654                                            * container can work
1655                                            * fine */
1656                                           "/dev/null", "rwm",
1657                                           "/dev/zero", "rwm",
1658                                           "/dev/full", "rwm",
1659                                           "/dev/random", "rwm",
1660                                           "/dev/urandom", "rwm",
1661                                           "/dev/tty", "rwm",
1662                                           "/dev/net/tun", "rwm",
1663                                           /* Allow the container
1664                                            * access to ptys. However,
1665                                            * do not permit the
1666                                            * container to ever create
1667                                            * these device nodes. */
1668                                           "/dev/pts/ptmx", "rw",
1669                                           "char-pts", "rw");
1670                 if (r < 0)
1671                         return log_error_errno(r, "Failed to add device whitelist: %m");
1672
1673                 r = sd_bus_message_close_container(m);
1674                 if (r < 0)
1675                         return log_error_errno(r, "Failed to close container: %m");
1676
1677                 r = sd_bus_call(bus, m, 0, &error, NULL);
1678         }
1679
1680         if (r < 0) {
1681                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1682                 return r;
1683         }
1684
1685         return 0;
1686 }
1687
1688 static int terminate_machine(pid_t pid) {
1689         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1690         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1691         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1692         const char *path;
1693         int r;
1694
1695         if (!arg_register)
1696                 return 0;
1697
1698         r = sd_bus_default_system(&bus);
1699         if (r < 0)
1700                 return log_error_errno(r, "Failed to open system bus: %m");
1701
1702         r = sd_bus_call_method(
1703                         bus,
1704                         "org.freedesktop.machine1",
1705                         "/org/freedesktop/machine1",
1706                         "org.freedesktop.machine1.Manager",
1707                         "GetMachineByPID",
1708                         &error,
1709                         &reply,
1710                         "u",
1711                         (uint32_t) pid);
1712         if (r < 0) {
1713                 /* Note that the machine might already have been
1714                  * cleaned up automatically, hence don't consider it a
1715                  * failure if we cannot get the machine object. */
1716                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1717                 return 0;
1718         }
1719
1720         r = sd_bus_message_read(reply, "o", &path);
1721         if (r < 0)
1722                 return bus_log_parse_error(r);
1723
1724         r = sd_bus_call_method(
1725                         bus,
1726                         "org.freedesktop.machine1",
1727                         path,
1728                         "org.freedesktop.machine1.Machine",
1729                         "Terminate",
1730                         &error,
1731                         NULL,
1732                         NULL);
1733         if (r < 0) {
1734                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1735                 return 0;
1736         }
1737
1738         return 0;
1739 }
1740
1741 static int reset_audit_loginuid(void) {
1742         _cleanup_free_ char *p = NULL;
1743         int r;
1744
1745         if (arg_share_system)
1746                 return 0;
1747
1748         r = read_one_line_file("/proc/self/loginuid", &p);
1749         if (r == -ENOENT)
1750                 return 0;
1751         if (r < 0)
1752                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1753
1754         /* Already reset? */
1755         if (streq(p, "4294967295"))
1756                 return 0;
1757
1758         r = write_string_file("/proc/self/loginuid", "4294967295");
1759         if (r < 0) {
1760                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1761                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1762                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1763                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1764                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1765
1766                 sleep(5);
1767         }
1768
1769         return 0;
1770 }
1771
1772 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1773 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1774 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
1775
1776 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
1777         uint8_t result[8];
1778         size_t l, sz;
1779         uint8_t *v, *i;
1780         int r;
1781
1782         l = strlen(arg_machine);
1783         sz = sizeof(sd_id128_t) + l;
1784         if (idx > 0)
1785                 sz += sizeof(idx);
1786
1787         v = alloca(sz);
1788
1789         /* fetch some persistent data unique to the host */
1790         r = sd_id128_get_machine((sd_id128_t*) v);
1791         if (r < 0)
1792                 return r;
1793
1794         /* combine with some data unique (on this host) to this
1795          * container instance */
1796         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
1797         if (idx > 0) {
1798                 idx = htole64(idx);
1799                 memcpy(i, &idx, sizeof(idx));
1800         }
1801
1802         /* Let's hash the host machine ID plus the container name. We
1803          * use a fixed, but originally randomly created hash key here. */
1804         siphash24(result, v, sz, hash_key.bytes);
1805
1806         assert_cc(ETH_ALEN <= sizeof(result));
1807         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1808
1809         /* see eth_random_addr in the kernel */
1810         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1811         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1812
1813         return 0;
1814 }
1815
1816 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1817         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1818         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1819         struct ether_addr mac_host, mac_container;
1820         int r, i;
1821
1822         if (!arg_private_network)
1823                 return 0;
1824
1825         if (!arg_network_veth)
1826                 return 0;
1827
1828         /* Use two different interface name prefixes depending whether
1829          * we are in bridge mode or not. */
1830         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1831                  arg_network_bridge ? "vb" : "ve", arg_machine);
1832
1833         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1834         if (r < 0)
1835                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
1836
1837         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1838         if (r < 0)
1839                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
1840
1841         r = sd_rtnl_open(&rtnl, 0);
1842         if (r < 0)
1843                 return log_error_errno(r, "Failed to connect to netlink: %m");
1844
1845         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1846         if (r < 0)
1847                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1848
1849         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1850         if (r < 0)
1851                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1852
1853         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1854         if (r < 0)
1855                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1856
1857         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1858         if (r < 0)
1859                 return log_error_errno(r, "Failed to open netlink container: %m");
1860
1861         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1862         if (r < 0)
1863                 return log_error_errno(r, "Failed to open netlink container: %m");
1864
1865         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1866         if (r < 0)
1867                 return log_error_errno(r, "Failed to open netlink container: %m");
1868
1869         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1870         if (r < 0)
1871                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1872
1873         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1874         if (r < 0)
1875                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1876
1877         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1878         if (r < 0)
1879                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1880
1881         r = sd_rtnl_message_close_container(m);
1882         if (r < 0)
1883                 return log_error_errno(r, "Failed to close netlink container: %m");
1884
1885         r = sd_rtnl_message_close_container(m);
1886         if (r < 0)
1887                 return log_error_errno(r, "Failed to close netlink container: %m");
1888
1889         r = sd_rtnl_message_close_container(m);
1890         if (r < 0)
1891                 return log_error_errno(r, "Failed to close netlink container: %m");
1892
1893         r = sd_rtnl_call(rtnl, m, 0, NULL);
1894         if (r < 0)
1895                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
1896
1897         i = (int) if_nametoindex(iface_name);
1898         if (i <= 0)
1899                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
1900
1901         *ifi = i;
1902
1903         return 0;
1904 }
1905
1906 static int setup_bridge(const char veth_name[], int *ifi) {
1907         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1908         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1909         int r, bridge;
1910
1911         if (!arg_private_network)
1912                 return 0;
1913
1914         if (!arg_network_veth)
1915                 return 0;
1916
1917         if (!arg_network_bridge)
1918                 return 0;
1919
1920         bridge = (int) if_nametoindex(arg_network_bridge);
1921         if (bridge <= 0)
1922                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
1923
1924         *ifi = bridge;
1925
1926         r = sd_rtnl_open(&rtnl, 0);
1927         if (r < 0)
1928                 return log_error_errno(r, "Failed to connect to netlink: %m");
1929
1930         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1931         if (r < 0)
1932                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1933
1934         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1935         if (r < 0)
1936                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
1937
1938         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1939         if (r < 0)
1940                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
1941
1942         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1943         if (r < 0)
1944                 return log_error_errno(r, "Failed to add netlink master field: %m");
1945
1946         r = sd_rtnl_call(rtnl, m, 0, NULL);
1947         if (r < 0)
1948                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
1949
1950         return 0;
1951 }
1952
1953 static int parse_interface(struct udev *udev, const char *name) {
1954         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1955         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1956         int ifi;
1957
1958         ifi = (int) if_nametoindex(name);
1959         if (ifi <= 0)
1960                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
1961
1962         sprintf(ifi_str, "n%i", ifi);
1963         d = udev_device_new_from_device_id(udev, ifi_str);
1964         if (!d)
1965                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
1966
1967         if (udev_device_get_is_initialized(d) <= 0) {
1968                 log_error("Network interface %s is not initialized yet.", name);
1969                 return -EBUSY;
1970         }
1971
1972         return ifi;
1973 }
1974
1975 static int move_network_interfaces(pid_t pid) {
1976         _cleanup_udev_unref_ struct udev *udev = NULL;
1977         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1978         char **i;
1979         int r;
1980
1981         if (!arg_private_network)
1982                 return 0;
1983
1984         if (strv_isempty(arg_network_interfaces))
1985                 return 0;
1986
1987         r = sd_rtnl_open(&rtnl, 0);
1988         if (r < 0)
1989                 return log_error_errno(r, "Failed to connect to netlink: %m");
1990
1991         udev = udev_new();
1992         if (!udev) {
1993                 log_error("Failed to connect to udev.");
1994                 return -ENOMEM;
1995         }
1996
1997         STRV_FOREACH(i, arg_network_interfaces) {
1998                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1999                 int ifi;
2000
2001                 ifi = parse_interface(udev, *i);
2002                 if (ifi < 0)
2003                         return ifi;
2004
2005                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
2006                 if (r < 0)
2007                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2008
2009                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2010                 if (r < 0)
2011                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
2012
2013                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2014                 if (r < 0)
2015                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
2016         }
2017
2018         return 0;
2019 }
2020
2021 static int setup_macvlan(pid_t pid) {
2022         _cleanup_udev_unref_ struct udev *udev = NULL;
2023         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
2024         unsigned idx = 0;
2025         char **i;
2026         int r;
2027
2028         if (!arg_private_network)
2029                 return 0;
2030
2031         if (strv_isempty(arg_network_macvlan))
2032                 return 0;
2033
2034         r = sd_rtnl_open(&rtnl, 0);
2035         if (r < 0)
2036                 return log_error_errno(r, "Failed to connect to netlink: %m");
2037
2038         udev = udev_new();
2039         if (!udev) {
2040                 log_error("Failed to connect to udev.");
2041                 return -ENOMEM;
2042         }
2043
2044         STRV_FOREACH(i, arg_network_macvlan) {
2045                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2046                 _cleanup_free_ char *n = NULL;
2047                 struct ether_addr mac;
2048                 int ifi;
2049
2050                 ifi = parse_interface(udev, *i);
2051                 if (ifi < 0)
2052                         return ifi;
2053
2054                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
2055                 if (r < 0)
2056                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
2057
2058                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2059                 if (r < 0)
2060                         return log_error_errno(r, "Failed to allocate netlink message: %m");
2061
2062                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2063                 if (r < 0)
2064                         return log_error_errno(r, "Failed to add netlink interface index: %m");
2065
2066                 n = strappend("mv-", *i);
2067                 if (!n)
2068                         return log_oom();
2069
2070                 strshorten(n, IFNAMSIZ-1);
2071
2072                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2073                 if (r < 0)
2074                         return log_error_errno(r, "Failed to add netlink interface name: %m");
2075
2076                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
2077                 if (r < 0)
2078                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
2079
2080                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2081                 if (r < 0)
2082                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
2083
2084                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2085                 if (r < 0)
2086                         return log_error_errno(r, "Failed to open netlink container: %m");
2087
2088                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2089                 if (r < 0)
2090                         return log_error_errno(r, "Failed to open netlink container: %m");
2091
2092                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2093                 if (r < 0)
2094                         return log_error_errno(r, "Failed to append macvlan mode: %m");
2095
2096                 r = sd_rtnl_message_close_container(m);
2097                 if (r < 0)
2098                         return log_error_errno(r, "Failed to close netlink container: %m");
2099
2100                 r = sd_rtnl_message_close_container(m);
2101                 if (r < 0)
2102                         return log_error_errno(r, "Failed to close netlink container: %m");
2103
2104                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2105                 if (r < 0)
2106                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
2107         }
2108
2109         return 0;
2110 }
2111
/* Builds and loads a seccomp filter that allows everything by default
 * but makes the syscalls in the table below fail with EPERM and makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Compiled to a no-op returning 0 when HAVE_SECCOMP is not defined.
 * Otherwise returns the result of the last libseccomp call made, which
 * is negative on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blacklist); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[k], 0);

                /* -EFAULT is treated as "unknown syscall" and skipped. */
                if (r >= 0 || r == -EFAULT)
                        continue;

                log_error_errno(r, "Failed to block syscall: %m");
                goto finish;
        }

        /*
           Audit is broken in containers: much of the userspace audit
           hookup fails when running inside of one. We do not care and
           simply turn off the creation of audit sockets.

           socket(AF_NETLINK, *, NETLINK_AUDIT) will hence fail with
           EAFNOSUPPORT, which audit userspace takes as the indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path, including
         * after a successful load. */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2191
2192 static int setup_propagate(const char *root) {
2193         const char *p, *q;
2194
2195         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2196         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2197         p = strappenda("/run/systemd/nspawn/propagate/", arg_machine);
2198         (void) mkdir_p(p, 0600);
2199
2200         q = strappenda(root, "/run/systemd/nspawn/incoming");
2201         mkdir_parents(q, 0755);
2202         mkdir_p(q, 0600);
2203
2204         if (mount(p, q, NULL, MS_BIND, NULL) < 0)
2205                 return log_error_errno(errno, "Failed to install propagation bind mount.");
2206
2207         if (mount(NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0)
2208                 return log_error_errno(errno, "Failed to make propagation mount read-only");
2209
2210         return 0;
2211 }
2212
2213 static int setup_image(char **device_path, int *loop_nr) {
2214         struct loop_info64 info = {
2215                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2216         };
2217         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2218         _cleanup_free_ char* loopdev = NULL;
2219         struct stat st;
2220         int r, nr;
2221
2222         assert(device_path);
2223         assert(loop_nr);
2224         assert(arg_image);
2225
2226         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2227         if (fd < 0)
2228                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2229
2230         if (fstat(fd, &st) < 0)
2231                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2232
2233         if (S_ISBLK(st.st_mode)) {
2234                 char *p;
2235
2236                 p = strdup(arg_image);
2237                 if (!p)
2238                         return log_oom();
2239
2240                 *device_path = p;
2241
2242                 *loop_nr = -1;
2243
2244                 r = fd;
2245                 fd = -1;
2246
2247                 return r;
2248         }
2249
2250         if (!S_ISREG(st.st_mode)) {
2251                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2252                 return -EINVAL;
2253         }
2254
2255         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2256         if (control < 0)
2257                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2258
2259         nr = ioctl(control, LOOP_CTL_GET_FREE);
2260         if (nr < 0)
2261                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2262
2263         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2264                 return log_oom();
2265
2266         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2267         if (loop < 0)
2268                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2269
2270         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2271                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2272
2273         if (arg_read_only)
2274                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2275
2276         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2277                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2278
2279         *device_path = loopdev;
2280         loopdev = NULL;
2281
2282         *loop_nr = nr;
2283
2284         r = loop;
2285         loop = -1;
2286
2287         return r;
2288 }
2289
/*
 * Locate the partitions of a GPT disk image that nspawn knows how to mount.
 *
 * 'fd' is an open descriptor of the (loop) block device carrying the image.
 * The partition table is probed with libblkid and must be of type "gpt".
 * The partition block devices are then enumerated through udev (children of
 * the device behind 'fd') and matched by GPT partition type ID against
 * GPT_HOME, GPT_SRV and, where defined at build time, GPT_ROOT_NATIVE and
 * GPT_ROOT_SECONDARY. Partitions flagged GPT_FLAG_NO_AUTO are ignored. If
 * several partitions share a type, the one with the lowest partition number
 * wins.
 *
 * On success 0 is returned and:
 *   - *root_device receives a newly allocated device node path (ownership
 *     passes to the caller), *root_device_rw whether the partition lacks
 *     GPT_FLAG_READ_ONLY, and *secondary whether the secondary-architecture
 *     root had to be used because no native root was found;
 *   - *home_device / *home_device_rw and *srv_device / *srv_device_rw are
 *     filled in the same way, but only if such a partition exists; otherwise
 *     they are left untouched.
 *
 * On failure a negative errno-style code is returned after logging; -EINVAL
 * if the image carries no usable GPT or no root partition, -ENOTSUP if built
 * without libblkid.
 */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif

        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);
        assert(arg_image);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared before each libblkid call so that a failure which
         * leaves it untouched can be told apart from one that sets it. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambiguous result, 1: nothing detected */
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                /* errno is known to be set here, so include the reason in
                 * the message like the other error paths do. */
                return log_error_errno(errno, "Failed to list partitions of %s: %m", arg_image);
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        /* Restrict the enumeration to the whole-disk device and its children */
        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                /* Map the udev device back to its blkid partition entry */
                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                if (sd_id128_equal(type_id, GPT_HOME)) {

                        /* Keep the candidate with the lowest partition number */
                        if (home && nr >= home_nr)
                                continue;

                        home_nr = nr;
                        home_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(home);
                        home = strdup(node);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_id, GPT_SRV)) {

                        if (srv && nr >= srv_nr)
                                continue;

                        srv_nr = nr;
                        srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(srv);
                        srv = strdup(node);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                        if (root && nr >= root_nr)
                                continue;

                        root_nr = nr;
                        root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(root);
                        root = strdup(node);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && nr >= secondary_root_nr)
                                continue;

                        secondary_root_nr = nr;
                        secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);


                        free(secondary_root);
                        secondary_root = strdup(node);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Hand the strings over to the caller; the locals are reset to NULL
         * so that the _cleanup_free_ handlers do not free them. The native
         * root is preferred over the secondary one. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2543
/*
 * Mount the block device 'what' at 'where' (with 'directory' appended if it
 * is non-NULL), after probing its file system type with libblkid.
 *
 * The mount is read-only if 'rw' is false or if the global arg_read_only is
 * set; MS_NODEV is always applied. LUKS-encrypted devices are refused.
 *
 * Returns 0 on success or a negative errno-style code after logging:
 * -EINVAL if no file system type could be determined, -ENOTSUP for LUKS or
 * when built without libblkid, otherwise the error of the failing call.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* errno is cleared so that an allocation failure inside libblkid,
         * which leaves it untouched, is reported as OOM. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambiguous result, 1: nothing detected. A real probe
                 * error (-1) is handled below so that its errno is kept. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2607
2608 static int mount_devices(
2609                 const char *where,
2610                 const char *root_device, bool root_device_rw,
2611                 const char *home_device, bool home_device_rw,
2612                 const char *srv_device, bool srv_device_rw) {
2613         int r;
2614
2615         assert(where);
2616
2617         if (root_device) {
2618                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2619                 if (r < 0)
2620                         return log_error_errno(r, "Failed to mount root directory: %m");
2621         }
2622
2623         if (home_device) {
2624                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2625                 if (r < 0)
2626                         return log_error_errno(r, "Failed to mount home directory: %m");
2627         }
2628
2629         if (srv_device) {
2630                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2631                 if (r < 0)
2632                         return log_error_errno(r, "Failed to mount server data directory: %m");
2633         }
2634
2635         return 0;
2636 }
2637
2638 static void loop_remove(int nr, int *image_fd) {
2639         _cleanup_close_ int control = -1;
2640         int r;
2641
2642         if (nr < 0)
2643                 return;
2644
2645         if (image_fd && *image_fd >= 0) {
2646                 r = ioctl(*image_fd, LOOP_CLR_FD);
2647                 if (r < 0)
2648                         log_warning_errno(errno, "Failed to close loop image: %m");
2649                 *image_fd = safe_close(*image_fd);
2650         }
2651
2652         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2653         if (control < 0) {
2654                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2655                 return;
2656         }
2657
2658         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2659         if (r < 0)
2660                 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
2661 }
2662
2663 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2664         int pipe_fds[2];
2665         pid_t pid;
2666
2667         assert(database);
2668         assert(key);
2669         assert(rpid);
2670
2671         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2672                 return log_error_errno(errno, "Failed to allocate pipe: %m");
2673
2674         pid = fork();
2675         if (pid < 0)
2676                 return log_error_errno(errno, "Failed to fork getent child: %m");
2677         else if (pid == 0) {
2678                 int nullfd;
2679                 char *empty_env = NULL;
2680
2681                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2682                         _exit(EXIT_FAILURE);
2683
2684                 if (pipe_fds[0] > 2)
2685                         safe_close(pipe_fds[0]);
2686                 if (pipe_fds[1] > 2)
2687                         safe_close(pipe_fds[1]);
2688
2689                 nullfd = open("/dev/null", O_RDWR);
2690                 if (nullfd < 0)
2691                         _exit(EXIT_FAILURE);
2692
2693                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2694                         _exit(EXIT_FAILURE);
2695
2696                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2697                         _exit(EXIT_FAILURE);
2698
2699                 if (nullfd > 2)
2700                         safe_close(nullfd);
2701
2702                 reset_all_signal_handlers();
2703                 close_all_fds(NULL, 0);
2704
2705                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2706                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2707                 _exit(EXIT_FAILURE);
2708         }
2709
2710         pipe_fds[1] = safe_close(pipe_fds[1]);
2711
2712         *rpid = pid;
2713
2714         return pipe_fds[0];
2715 }
2716
2717 static int change_uid_gid(char **_home) {
2718         char line[LINE_MAX], *x, *u, *g, *h;
2719         const char *word, *state;
2720         _cleanup_free_ uid_t *uids = NULL;
2721         _cleanup_free_ char *home = NULL;
2722         _cleanup_fclose_ FILE *f = NULL;
2723         _cleanup_close_ int fd = -1;
2724         unsigned n_uids = 0;
2725         size_t sz = 0, l;
2726         uid_t uid;
2727         gid_t gid;
2728         pid_t pid;
2729         int r;
2730
2731         assert(_home);
2732
2733         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2734                 /* Reset everything fully to 0, just in case */
2735
2736                 if (setgroups(0, NULL) < 0)
2737                         return log_error_errno(errno, "setgroups() failed: %m");
2738
2739                 if (setresgid(0, 0, 0) < 0)
2740                         return log_error_errno(errno, "setregid() failed: %m");
2741
2742                 if (setresuid(0, 0, 0) < 0)
2743                         return log_error_errno(errno, "setreuid() failed: %m");
2744
2745                 *_home = NULL;
2746                 return 0;
2747         }
2748
2749         /* First, get user credentials */
2750         fd = spawn_getent("passwd", arg_user, &pid);
2751         if (fd < 0)
2752                 return fd;
2753
2754         f = fdopen(fd, "r");
2755         if (!f)
2756                 return log_oom();
2757         fd = -1;
2758
2759         if (!fgets(line, sizeof(line), f)) {
2760
2761                 if (!ferror(f)) {
2762                         log_error("Failed to resolve user %s.", arg_user);
2763                         return -ESRCH;
2764                 }
2765
2766                 log_error_errno(errno, "Failed to read from getent: %m");
2767                 return -errno;
2768         }
2769
2770         truncate_nl(line);
2771
2772         wait_for_terminate_and_warn("getent passwd", pid, true);
2773
2774         x = strchr(line, ':');
2775         if (!x) {
2776                 log_error("/etc/passwd entry has invalid user field.");
2777                 return -EIO;
2778         }
2779
2780         u = strchr(x+1, ':');
2781         if (!u) {
2782                 log_error("/etc/passwd entry has invalid password field.");
2783                 return -EIO;
2784         }
2785
2786         u++;
2787         g = strchr(u, ':');
2788         if (!g) {
2789                 log_error("/etc/passwd entry has invalid UID field.");
2790                 return -EIO;
2791         }
2792
2793         *g = 0;
2794         g++;
2795         x = strchr(g, ':');
2796         if (!x) {
2797                 log_error("/etc/passwd entry has invalid GID field.");
2798                 return -EIO;
2799         }
2800
2801         *x = 0;
2802         h = strchr(x+1, ':');
2803         if (!h) {
2804                 log_error("/etc/passwd entry has invalid GECOS field.");
2805                 return -EIO;
2806         }
2807
2808         h++;
2809         x = strchr(h, ':');
2810         if (!x) {
2811                 log_error("/etc/passwd entry has invalid home directory field.");
2812                 return -EIO;
2813         }
2814
2815         *x = 0;
2816
2817         r = parse_uid(u, &uid);
2818         if (r < 0) {
2819                 log_error("Failed to parse UID of user.");
2820                 return -EIO;
2821         }
2822
2823         r = parse_gid(g, &gid);
2824         if (r < 0) {
2825                 log_error("Failed to parse GID of user.");
2826                 return -EIO;
2827         }
2828
2829         home = strdup(h);
2830         if (!home)
2831                 return log_oom();
2832
2833         /* Second, get group memberships */
2834         fd = spawn_getent("initgroups", arg_user, &pid);
2835         if (fd < 0)
2836                 return fd;
2837
2838         fclose(f);
2839         f = fdopen(fd, "r");
2840         if (!f)
2841                 return log_oom();
2842         fd = -1;
2843
2844         if (!fgets(line, sizeof(line), f)) {
2845                 if (!ferror(f)) {
2846                         log_error("Failed to resolve user %s.", arg_user);
2847                         return -ESRCH;
2848                 }
2849
2850                 log_error_errno(errno, "Failed to read from getent: %m");
2851                 return -errno;
2852         }
2853
2854         truncate_nl(line);
2855
2856         wait_for_terminate_and_warn("getent initgroups", pid, true);
2857
2858         /* Skip over the username and subsequent separator whitespace */
2859         x = line;
2860         x += strcspn(x, WHITESPACE);
2861         x += strspn(x, WHITESPACE);
2862
2863         FOREACH_WORD(word, l, x, state) {
2864                 char c[l+1];
2865
2866                 memcpy(c, word, l);
2867                 c[l] = 0;
2868
2869                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2870                         return log_oom();
2871
2872                 r = parse_uid(c, &uids[n_uids++]);
2873                 if (r < 0) {
2874                         log_error("Failed to parse group data from getent.");
2875                         return -EIO;
2876                 }
2877         }
2878
2879         r = mkdir_parents(home, 0775);
2880         if (r < 0)
2881                 return log_error_errno(r, "Failed to make home root directory: %m");
2882
2883         r = mkdir_safe(home, 0755, uid, gid);
2884         if (r < 0 && r != -EEXIST)
2885                 return log_error_errno(r, "Failed to make home directory: %m");
2886
2887         fchown(STDIN_FILENO, uid, gid);
2888         fchown(STDOUT_FILENO, uid, gid);
2889         fchown(STDERR_FILENO, uid, gid);
2890
2891         if (setgroups(n_uids, uids) < 0)
2892                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
2893
2894         if (setresgid(gid, gid, gid) < 0)
2895                 return log_error_errno(errno, "setregid() failed: %m");
2896
2897         if (setresuid(uid, uid, uid) < 0)
2898                 return log_error_errno(errno, "setreuid() failed: %m");
2899
2900         if (_home) {
2901                 *_home = home;
2902                 home = NULL;
2903         }
2904
2905         return 0;
2906 }
2907
2908 /*
2909  * Return values:
2910  * < 0 : wait_for_terminate() failed to get the state of the
2911  *       container, the container was terminated by a signal, or
2912  *       failed for an unknown reason.  No change is made to the
2913  *       container argument.
2914  * > 0 : The program executed in the container terminated with an
2915  *       error.  The exit code of the program executed in the
2916  *       container is returned.  The container argument has been set
2917  *       to CONTAINER_TERMINATED.
2918  *   0 : The container is being rebooted, has been shut down or exited
2919  *       successfully.  The container argument has been set to either
2920  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2921  *
2922  * That is, success is indicated by a return value of zero, and an
2923  * error is indicated by a non-zero value.
2924  */
2925 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2926         siginfo_t status;
2927         int r;
2928
2929         r = wait_for_terminate(pid, &status);
2930         if (r < 0)
2931                 return log_warning_errno(r, "Failed to wait for container: %m");
2932
2933         switch (status.si_code) {
2934
2935         case CLD_EXITED:
2936                 if (status.si_status == 0) {
2937                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2938
2939                 } else
2940                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2941
2942                 *container = CONTAINER_TERMINATED;
2943                 return status.si_status;
2944
2945         case CLD_KILLED:
2946                 if (status.si_status == SIGINT) {
2947
2948                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2949                         *container = CONTAINER_TERMINATED;
2950                         return 0;
2951
2952                 } else if (status.si_status == SIGHUP) {
2953
2954                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2955                         *container = CONTAINER_REBOOTED;
2956                         return 0;
2957                 }
2958
2959                 /* CLD_KILLED fallthrough */
2960
2961         case CLD_DUMPED:
2962                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2963                 return -EIO;
2964
2965         default:
2966                 log_error("Container %s failed due to unknown reason.", arg_machine);
2967                 return -EIO;
2968         }
2969
2970         return r;
2971 }
2972
/* Signal handler that deliberately does nothing with the signal it receives. */
static void nop_handler(int sig) {
        /* intentionally empty */
}
2974
2975 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2976         pid_t pid;
2977
2978         pid = PTR_TO_UINT32(userdata);
2979         if (pid > 0) {
2980                 if (kill(pid, SIGRTMIN+3) >= 0) {
2981                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2982                         sd_event_source_set_userdata(s, NULL);
2983                         return 0;
2984                 }
2985         }
2986
2987         sd_event_exit(sd_event_source_get_event(s), 0);
2988         return 0;
2989 }
2990
2991 static int determine_names(void) {
2992         int r;
2993
2994         if (!arg_image && !arg_directory) {
2995                 if (arg_machine) {
2996                         _cleanup_(image_unrefp) Image *i = NULL;
2997
2998                         r = image_find(arg_machine, &i);
2999                         if (r < 0)
3000                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3001                         else if (r == 0) {
3002                                 log_error("No image for machine '%s': %m", arg_machine);
3003                                 return -ENOENT;
3004                         }
3005
3006                         if (i->type == IMAGE_GPT)
3007                                 r = set_sanitized_path(&arg_image, i->path);
3008                         else
3009                                 r = set_sanitized_path(&arg_directory, i->path);
3010                         if (r < 0)
3011                                 return log_error_errno(r, "Invalid image directory: %m");
3012
3013                         arg_read_only = arg_read_only || i->read_only;
3014                 } else
3015                         arg_directory = get_current_dir_name();
3016
3017                 if (!arg_directory && !arg_machine) {
3018                         log_error("Failed to determine path, please use -D or -i.");
3019                         return -EINVAL;
3020                 }
3021         }
3022
3023         if (!arg_machine) {
3024                 if (arg_directory && path_equal(arg_directory, "/"))
3025                         arg_machine = gethostname_malloc();
3026                 else
3027                         arg_machine = strdup(basename(arg_image ?: arg_directory));
3028
3029                 if (!arg_machine)
3030                         return log_oom();
3031
3032                 hostname_cleanup(arg_machine, false);
3033                 if (!machine_name_is_valid(arg_machine)) {
3034                         log_error("Failed to determine machine name automatically, please use -M.");
3035                         return -EINVAL;
3036                 }
3037
3038                 if (arg_ephemeral) {
3039                         char *b;
3040
3041                         /* Add a random suffix when this is an
3042                          * ephemeral machine, so that we can run many
3043                          * instances at once without manually having
3044                          * to specify -M each time. */
3045
3046                         if (asprintf(&b, "%s-%016" PRIx64, arg_machine, random_u64()) < 0)
3047                                 return log_oom();
3048
3049                         free(arg_machine);
3050                         arg_machine = b;
3051                 }
3052         }
3053
3054         return 0;
3055 }
3056
3057 int main(int argc, char *argv[]) {
3058
3059         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL, *console = NULL;
3060         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
3061         _cleanup_close_ int master = -1, image_fd = -1;
3062         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
3063         _cleanup_fdset_free_ FDSet *fds = NULL;
3064         int r, n_fd_passed, loop_nr = -1;
3065         char veth_name[IFNAMSIZ];
3066         bool secondary = false, remove_subvol = false;
3067         sigset_t mask, mask_chld;
3068         pid_t pid = 0;
3069         int ret = EXIT_SUCCESS;
3070
3071         log_parse_environment();
3072         log_open();
3073
3074         r = parse_argv(argc, argv);
3075         if (r <= 0)
3076                 goto finish;
3077
3078         r = determine_names();
3079         if (r < 0)
3080                 goto finish;
3081
3082         if (geteuid() != 0) {
3083                 log_error("Need to be root.");
3084                 r = -EPERM;
3085                 goto finish;
3086         }
3087
3088         if (sd_booted() <= 0) {
3089                 log_error("Not running on a systemd system.");
3090                 r = -EINVAL;
3091                 goto finish;
3092         }
3093
3094         log_close();
3095         n_fd_passed = sd_listen_fds(false);
3096         if (n_fd_passed > 0) {
3097                 r = fdset_new_listen_fds(&fds, false);
3098                 if (r < 0) {
3099                         log_error_errno(r, "Failed to collect file descriptors: %m");
3100                         goto finish;
3101                 }
3102         }
3103         fdset_close_others(fds);
3104         log_open();
3105
3106         if (arg_directory) {
3107                 assert(!arg_image);
3108
3109                 if (path_equal(arg_directory, "/") && !arg_ephemeral) {
3110                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral.");
3111                         r = -EINVAL;
3112                         goto finish;
3113                 }
3114
3115                 if (arg_template) {
3116                         r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
3117                         if (r == -EEXIST) {
3118                                 if (!arg_quiet)
3119                                         log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
3120                         } else if (r < 0) {
3121                                 log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
3122                                 goto finish;
3123                         } else {
3124                                 if (!arg_quiet)
3125                                         log_info("Populated %s from template %s.", arg_directory, arg_template);
3126                         }
3127
3128                 } else if (arg_ephemeral) {
3129                         char *np;
3130
3131                         /* If the specified path is a mount point we
3132                          * generate the new snapshot immediately
3133                          * inside it under a random name. However if
3134                          * the specified is not a mount point we
3135                          * create the new snapshot in the parent
3136                          * directory, just next to it. */
3137                         r = path_is_mount_point(arg_directory, false);
3138                         if (r < 0) {
3139                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
3140                                 goto finish;
3141                         }
3142                         if (r > 0)
3143                                 r = tempfn_random_child(arg_directory, &np);
3144                         else
3145                                 r = tempfn_random(arg_directory, &np);
3146                         if (r < 0) {
3147                                 log_error_errno(r, "Failed to generate name for snapshot: %m");
3148                                 goto finish;
3149                         }
3150
3151                         r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
3152                         if (r < 0) {
3153                                 free(np);
3154                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
3155                                 goto finish;
3156                         }
3157
3158                         free(arg_directory);
3159                         arg_directory = np;
3160
3161                         remove_subvol = true;
3162                 }
3163
3164                 if (arg_boot) {
3165                         if (path_is_os_tree(arg_directory) <= 0) {
3166                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3167                                 r = -EINVAL;
3168                                 goto finish;
3169                         }
3170                 } else {
3171                         const char *p;
3172
3173                         p = strappenda(arg_directory,
3174                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3175                         if (access(p, F_OK) < 0) {
3176                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3177                                 r = -EINVAL;
3178                                 goto finish;
3179                         }
3180                 }
3181
3182         } else {
3183                 char template[] = "/tmp/nspawn-root-XXXXXX";
3184
3185                 assert(arg_image);
3186                 assert(!arg_template);
3187
3188                 if (!mkdtemp(template)) {
3189                         log_error_errno(errno, "Failed to create temporary directory: %m");
3190                         r = -errno;
3191                         goto finish;
3192                 }
3193
3194                 arg_directory = strdup(template);
3195                 if (!arg_directory) {
3196                         r = log_oom();
3197                         goto finish;
3198                 }
3199
3200                 image_fd = setup_image(&device_path, &loop_nr);
3201                 if (image_fd < 0) {
3202                         r = image_fd;
3203                         goto finish;
3204                 }
3205
3206                 r = dissect_image(image_fd,
3207                                   &root_device, &root_device_rw,
3208                                   &home_device, &home_device_rw,
3209                                   &srv_device, &srv_device_rw,
3210                                   &secondary);
3211                 if (r < 0)
3212                         goto finish;
3213         }
3214
3215         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3216         if (master < 0) {
3217                 r = log_error_errno(errno, "Failed to acquire pseudo tty: %m");
3218                 goto finish;
3219         }
3220
3221         r = ptsname_malloc(master, &console);
3222         if (r < 0) {
3223                 r = log_error_errno(r, "Failed to determine tty name: %m");
3224                 goto finish;
3225         }
3226
3227         if (!arg_quiet)
3228                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3229                          arg_machine, arg_image ?: arg_directory);
3230
3231         if (unlockpt(master) < 0) {
3232                 r = log_error_errno(errno, "Failed to unlock tty: %m");
3233                 goto finish;
3234         }
3235
3236         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3237                 r = log_error_errno(errno, "Failed to create kmsg socket pair: %m");
3238                 goto finish;
3239         }
3240
3241         assert_se(sigemptyset(&mask) == 0);
3242         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3243         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3244
3245         assert_se(sigemptyset(&mask_chld) == 0);
3246         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3247
3248         for (;;) {
3249                 ContainerStatus container_status;
3250                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3251                 struct sigaction sa = {
3252                         .sa_handler = nop_handler,
3253                         .sa_flags = SA_NOCLDSTOP,
3254                 };
3255
3256                 r = barrier_create(&barrier);
3257                 if (r < 0) {
3258                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3259                         goto finish;
3260                 }
3261
3262                 /* Child can be killed before execv(), so handle SIGCHLD
3263                  * in order to interrupt parent's blocking calls and
3264                  * give it a chance to call wait() and terminate. */
3265                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3266                 if (r < 0) {
3267                         r = log_error_errno(errno, "Failed to change the signal mask: %m");
3268                         goto finish;
3269                 }
3270
3271                 r = sigaction(SIGCHLD, &sa, NULL);
3272                 if (r < 0) {
3273                         r = log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
3274                         goto finish;
3275                 }
3276
3277                 pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3278                                 (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3279                                 (arg_private_network ? CLONE_NEWNET : 0), NULL);
3280                 if (pid < 0) {
3281                         if (errno == EINVAL)
3282                                 r = log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3283                         else
3284                                 r = log_error_errno(errno, "clone() failed: %m");
3285
3286                         goto finish;
3287                 }
3288
3289                 if (pid == 0) {
3290                         /* child */
3291                         _cleanup_free_ char *home = NULL;
3292                         unsigned n_env = 2;
3293                         const char *envp[] = {
3294                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3295                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3296                                 NULL, /* TERM */
3297                                 NULL, /* HOME */
3298                                 NULL, /* USER */
3299                                 NULL, /* LOGNAME */
3300                                 NULL, /* container_uuid */
3301                                 NULL, /* LISTEN_FDS */
3302                                 NULL, /* LISTEN_PID */
3303                                 NULL
3304                         };
3305                         char **env_use;
3306
3307                         barrier_set_role(&barrier, BARRIER_CHILD);
3308
3309                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3310                         if (envp[n_env])
3311                                 n_env ++;
3312
3313                         master = safe_close(master);
3314
3315                         close_nointr(STDIN_FILENO);
3316                         close_nointr(STDOUT_FILENO);
3317                         close_nointr(STDERR_FILENO);
3318
3319                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3320
3321                         reset_all_signal_handlers();
3322                         reset_signal_mask();
3323
3324                         r = open_terminal(console, O_RDWR);
3325                         if (r != STDIN_FILENO) {
3326                                 if (r >= 0) {
3327                                         safe_close(r);
3328                                         r = -EINVAL;
3329                                 }
3330
3331                                 log_error_errno(r, "Failed to open console: %m");
3332                                 _exit(EXIT_FAILURE);
3333                         }
3334
3335                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3336                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3337                                 log_error_errno(errno, "Failed to duplicate console: %m");
3338                                 _exit(EXIT_FAILURE);
3339                         }
3340
3341                         if (setsid() < 0) {
3342                                 log_error_errno(errno, "setsid() failed: %m");
3343                                 _exit(EXIT_FAILURE);
3344                         }
3345
3346                         if (reset_audit_loginuid() < 0)
3347                                 _exit(EXIT_FAILURE);
3348
3349                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3350                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3351                                 _exit(EXIT_FAILURE);
3352                         }
3353
3354                         /* Mark everything as slave, so that we still
3355                          * receive mounts from the real root, but don't
3356                          * propagate mounts to the real root. */
3357                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3358                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3359                                 _exit(EXIT_FAILURE);
3360                         }
3361
3362                         if (mount_devices(arg_directory,
3363                                           root_device, root_device_rw,
3364                                           home_device, home_device_rw,
3365                                           srv_device, srv_device_rw) < 0)
3366                                 _exit(EXIT_FAILURE);
3367
3368                         /* Turn directory into bind mount */
3369                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3370                                 log_error_errno(errno, "Failed to make bind mount: %m");
3371                                 _exit(EXIT_FAILURE);
3372                         }
3373
3374                         r = setup_volatile(arg_directory);
3375                         if (r < 0)
3376                                 _exit(EXIT_FAILURE);
3377
3378                         if (setup_volatile_state(arg_directory) < 0)
3379                                 _exit(EXIT_FAILURE);
3380
3381                         r = base_filesystem_create(arg_directory);
3382                         if (r < 0)
3383                                 _exit(EXIT_FAILURE);
3384
3385                         if (arg_read_only) {
3386                                 r = bind_remount_recursive(arg_directory, true);
3387                                 if (r < 0) {
3388                                         log_error_errno(r, "Failed to make tree read-only: %m");
3389                                         _exit(EXIT_FAILURE);
3390                                 }
3391                         }
3392
3393                         if (mount_all(arg_directory) < 0)
3394                                 _exit(EXIT_FAILURE);
3395
3396                         if (copy_devnodes(arg_directory) < 0)
3397                                 _exit(EXIT_FAILURE);
3398
3399                         if (setup_ptmx(arg_directory) < 0)
3400                                 _exit(EXIT_FAILURE);
3401
3402                         dev_setup(arg_directory);
3403
3404                         if (setup_propagate(arg_directory) < 0)
3405                                 _exit(EXIT_FAILURE);
3406
3407                         if (setup_seccomp() < 0)
3408                                 _exit(EXIT_FAILURE);
3409
3410                         if (setup_dev_console(arg_directory, console) < 0)
3411                                 _exit(EXIT_FAILURE);
3412
3413                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3414                                 _exit(EXIT_FAILURE);
3415
3416                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3417
3418                         /* Tell the parent that we are ready, and that
3419                          * it can cgroupify us so that we lack access
3420                          * to certain devices and resources. */
3421                         (void) barrier_place(&barrier);
3422
3423                         if (setup_boot_id(arg_directory) < 0)
3424                                 _exit(EXIT_FAILURE);
3425
3426                         if (setup_timezone(arg_directory) < 0)
3427                                 _exit(EXIT_FAILURE);
3428
3429                         if (setup_resolv_conf(arg_directory) < 0)
3430                                 _exit(EXIT_FAILURE);
3431
3432                         if (setup_journal(arg_directory) < 0)
3433                                 _exit(EXIT_FAILURE);
3434
3435                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3436                                 _exit(EXIT_FAILURE);
3437
3438                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3439                                 _exit(EXIT_FAILURE);
3440
3441                         if (mount_tmpfs(arg_directory) < 0)
3442                                 _exit(EXIT_FAILURE);
3443
3444                         /* Wait until we are cgroup-ified, so that we
3445                          * can mount the right cgroup path writable */
3446                         (void) barrier_sync_next(&barrier);
3447
3448                         if (mount_cgroup(arg_directory) < 0)
3449                                 _exit(EXIT_FAILURE);
3450
3451                         if (chdir(arg_directory) < 0) {
3452                                 log_error_errno(errno, "chdir(%s) failed: %m", arg_directory);
3453                                 _exit(EXIT_FAILURE);
3454                         }
3455
3456                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3457                                 log_error_errno(errno, "mount(MS_MOVE) failed: %m");
3458                                 _exit(EXIT_FAILURE);
3459                         }
3460
3461                         if (chroot(".") < 0) {
3462                                 log_error_errno(errno, "chroot() failed: %m");
3463                                 _exit(EXIT_FAILURE);
3464                         }
3465
3466                         if (chdir("/") < 0) {
3467                                 log_error_errno(errno, "chdir() failed: %m");
3468                                 _exit(EXIT_FAILURE);
3469                         }
3470
3471                         umask(0022);
3472
3473                         if (arg_private_network)
3474                                 loopback_setup();
3475
3476                         if (drop_capabilities() < 0) {
3477                                 log_error_errno(errno, "drop_capabilities() failed: %m");
3478                                 _exit(EXIT_FAILURE);
3479                         }
3480
3481                         r = change_uid_gid(&home);
3482                         if (r < 0)
3483                                 _exit(EXIT_FAILURE);
3484
3485                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3486                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3487                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3488                                 log_oom();
3489                                 _exit(EXIT_FAILURE);
3490                         }
3491
3492                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3493                                 char as_uuid[37];
3494
3495                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3496                                         log_oom();
3497                                         _exit(EXIT_FAILURE);
3498                                 }
3499                         }
3500
3501                         if (fdset_size(fds) > 0) {
3502                                 r = fdset_cloexec(fds, false);
3503                                 if (r < 0) {
3504                                         log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3505                                         _exit(EXIT_FAILURE);
3506                                 }
3507
3508                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3509                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3510                                         log_oom();
3511                                         _exit(EXIT_FAILURE);
3512                                 }
3513                         }
3514
3515                         setup_hostname();
3516
3517                         if (arg_personality != 0xffffffffLU) {
3518                                 if (personality(arg_personality) < 0) {
3519                                         log_error_errno(errno, "personality() failed: %m");
3520                                         _exit(EXIT_FAILURE);
3521                                 }
3522                         } else if (secondary) {
3523                                 if (personality(PER_LINUX32) < 0) {
3524                                         log_error_errno(errno, "personality() failed: %m");
3525                                         _exit(EXIT_FAILURE);
3526                                 }
3527                         }
3528
3529 #ifdef HAVE_SELINUX
3530                         if (arg_selinux_context)
3531                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3532                                         log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3533                                         _exit(EXIT_FAILURE);
3534                                 }
3535 #endif
3536
3537                         if (!strv_isempty(arg_setenv)) {
3538                                 char **n;
3539
3540                                 n = strv_env_merge(2, envp, arg_setenv);
3541                                 if (!n) {
3542                                         log_oom();
3543                                         _exit(EXIT_FAILURE);
3544                                 }
3545
3546                                 env_use = n;
3547                         } else
3548                                 env_use = (char**) envp;
3549
3550                         /* Wait until the parent is ready with the setup, too... */
3551                         if (!barrier_place_and_sync(&barrier))
3552                                 _exit(EXIT_FAILURE);
3553
3554                         if (arg_boot) {
3555                                 char **a;
3556                                 size_t l;
3557
3558                                 /* Automatically search for the init system */
3559
3560                                 l = 1 + argc - optind;
3561                                 a = newa(char*, l + 1);
3562                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3563
3564                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3565                                 execve(a[0], a, env_use);
3566
3567                                 a[0] = (char*) "/lib/systemd/systemd";
3568                                 execve(a[0], a, env_use);
3569
3570                                 a[0] = (char*) "/sbin/init";
3571                                 execve(a[0], a, env_use);
3572                         } else if (argc > optind)
3573                                 execvpe(argv[optind], argv + optind, env_use);
3574                         else {
3575                                 chdir(home ? home : "/root");
3576                                 execle("/bin/bash", "-bash", NULL, env_use);
3577                                 execle("/bin/sh", "-sh", NULL, env_use);
3578                         }
3579
3580                         log_error_errno(errno, "execv() failed: %m");
3581                         _exit(EXIT_FAILURE);
3582                 }
3583
3584                 barrier_set_role(&barrier, BARRIER_PARENT);
3585                 fdset_free(fds);
3586                 fds = NULL;
3587
3588                 /* Wait for the most basic Child-setup to be done,
3589                  * before we add hardware to it, and place it in a
3590                  * cgroup. */
3591                 if (barrier_sync_next(&barrier)) {
3592                         _cleanup_event_unref_ sd_event *event = NULL;
3593                         _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3594                         char last_char = 0;
3595                         int ifi = 0;
3596
3597                         r = move_network_interfaces(pid);
3598                         if (r < 0)
3599                                 goto finish;
3600
3601                         r = setup_veth(pid, veth_name, &ifi);
3602                         if (r < 0)
3603                                 goto finish;
3604
3605                         r = setup_bridge(veth_name, &ifi);
3606                         if (r < 0)
3607                                 goto finish;
3608
3609                         r = setup_macvlan(pid);
3610                         if (r < 0)
3611                                 goto finish;
3612
3613                         r = register_machine(pid, ifi);
3614                         if (r < 0)
3615                                 goto finish;
3616
3617                         /* Block SIGCHLD here, before notifying child.
3618                          * process_pty() will handle it with the other signals. */
3619                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3620                         if (r < 0)
3621                                 goto finish;
3622
3623                         /* Reset signal to default */
3624                         r = default_signals(SIGCHLD, -1);
3625                         if (r < 0)
3626                                 goto finish;
3627
3628                         /* Notify the child that the parent is ready with all
3629                          * its setup, and that the child can now hand over
3630                          * control to the code to run inside the container. */
3631                         (void) barrier_place(&barrier);
3632
3633                         /* And wait until the child is completely ready now. */
3634                         (void) barrier_place_and_sync(&barrier);
3635
3636                         sd_notify(false,
3637                                   "READY=1\n"
3638                                   "STATUS=Container running.");
3639
3640                         r = sd_event_new(&event);
3641                         if (r < 0) {
3642                                 log_error_errno(r, "Failed to get default event source: %m");
3643                                 goto finish;
3644                         }
3645
3646                         if (arg_boot) {
3647                                 /* Try to kill the init system on SIGINT or SIGTERM */
3648                                 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3649                                 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3650                         } else {
3651                                 /* Immediately exit */
3652                                 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3653                                 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3654                         }
3655
3656                         /* simply exit on sigchld */
3657                         sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3658
3659                         r = pty_forward_new(event, master, true, &forward);
3660                         if (r < 0) {
3661                                 log_error_errno(r, "Failed to create PTY forwarder: %m");
3662                                 goto finish;
3663                         }
3664
3665                         r = sd_event_loop(event);
3666                         if (r < 0) {
3667                                 log_error_errno(r, "Failed to run event loop: %m");
3668                                 goto finish;
3669                         }
3670
3671                         pty_forward_last_char(forward, &last_char);
3672
3673                         forward = pty_forward_free(forward);
3674
3675                         if (!arg_quiet && last_char != '\n')
3676                                 putc('\n', stdout);
3677
3678                         /* Kill if it is not dead yet anyway */
3679                         terminate_machine(pid);
3680                 }
3681
3682                 /* Normally redundant, but better safe than sorry */
3683                 kill(pid, SIGKILL);
3684
3685                 r = wait_for_container(pid, &container_status);
3686                 pid = 0;
3687
3688                 if (r < 0)
3689                         /* We failed to wait for the container, or the
3690                          * container exited abnormally */
3691                         goto finish;
3692                 else if (r > 0 || container_status == CONTAINER_TERMINATED){
3693                         /* The container exited with a non-zero
3694                          * status, or with zero status and no reboot
3695                          * was requested. */
3696                         ret = r;
3697                         break;
3698                 }
3699
3700                 /* CONTAINER_REBOOTED, loop again */
3701
3702                 if (arg_keep_unit) {
3703                         /* Special handling if we are running as a
3704                          * service: instead of simply restarting the
3705                          * machine we want to restart the entire
3706                          * service, so let's inform systemd about this
3707                          * with the special exit code 133. The service
3708                          * file uses RestartForceExitStatus=133 so
3709                          * that this results in a full nspawn
3710                          * restart. This is necessary since we might
3711                          * have cgroup parameters set we want to have
3712                          * flushed out. */
3713                         ret = 133;
3714                         r = 0;
3715                         break;
3716                 }
3717         }
3718
3719 finish:
3720         sd_notify(false,
3721                   "STOPPING=1\n"
3722                   "STATUS=Terminating...");
3723
3724         loop_remove(loop_nr, &image_fd);
3725
3726         if (pid > 0)
3727                 kill(pid, SIGKILL);
3728
3729         if (remove_subvol && arg_directory) {
3730                 int k;
3731
3732                 k = btrfs_subvol_remove(arg_directory);
3733                 if (k < 0)
3734                         log_warning_errno(k, "Cannot remove subvolume '%s', ignoring: %m", arg_directory);
3735         }
3736
3737         if (arg_machine) {
3738                 const char *p;
3739
3740                 p = strappenda("/run/systemd/nspawn/propagate", arg_machine);
3741                 (void) rm_rf(p, false, true, false);
3742         }
3743
3744         free(arg_directory);
3745         free(arg_template);
3746         free(arg_image);
3747         free(arg_machine);
3748         free(arg_user);
3749         strv_free(arg_setenv);
3750         strv_free(arg_network_interfaces);
3751         strv_free(arg_network_macvlan);
3752         strv_free(arg_bind);
3753         strv_free(arg_bind_ro);
3754         strv_free(arg_tmpfs);
3755
3756         return r < 0 ? EXIT_FAILURE : ret;
3757 }