chiark / gitweb /
fca3222a87dbc6b2b81bf39759dcc26b41416ed2
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92 #include "event-util.h"
93
94 #ifdef HAVE_SECCOMP
95 #include "seccomp-util.h"
96 #endif
97
/* How a container run ended.
 * NOTE(review): the consumers of this enum are outside this part of the
 * file; the meaning is taken from the enumerator names only. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
102
/* Journal linking mode, selected with --link-journal=no|auto|host|guest
 * (the -j switch selects LINK_GUEST). */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
109
/* Volatile mode, selected with --volatile[=MODE]: a boolean argument maps to
 * VOLATILE_YES/VOLATILE_NO, the literal "state" maps to VOLATILE_STATE, and
 * the bare switch means VOLATILE_YES. */
typedef enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2,
} Volatile;
115
116 static char *arg_directory = NULL;
117 static char *arg_user = NULL;
118 static sd_id128_t arg_uuid = {};
119 static char *arg_machine = NULL;
120 static const char *arg_selinux_context = NULL;
121 static const char *arg_selinux_apifs_context = NULL;
122 static const char *arg_slice = NULL;
123 static bool arg_private_network = false;
124 static bool arg_read_only = false;
125 static bool arg_boot = false;
126 static LinkJournal arg_link_journal = LINK_AUTO;
127 static uint64_t arg_retain =
128         (1ULL << CAP_CHOWN) |
129         (1ULL << CAP_DAC_OVERRIDE) |
130         (1ULL << CAP_DAC_READ_SEARCH) |
131         (1ULL << CAP_FOWNER) |
132         (1ULL << CAP_FSETID) |
133         (1ULL << CAP_IPC_OWNER) |
134         (1ULL << CAP_KILL) |
135         (1ULL << CAP_LEASE) |
136         (1ULL << CAP_LINUX_IMMUTABLE) |
137         (1ULL << CAP_NET_BIND_SERVICE) |
138         (1ULL << CAP_NET_BROADCAST) |
139         (1ULL << CAP_NET_RAW) |
140         (1ULL << CAP_SETGID) |
141         (1ULL << CAP_SETFCAP) |
142         (1ULL << CAP_SETPCAP) |
143         (1ULL << CAP_SETUID) |
144         (1ULL << CAP_SYS_ADMIN) |
145         (1ULL << CAP_SYS_CHROOT) |
146         (1ULL << CAP_SYS_NICE) |
147         (1ULL << CAP_SYS_PTRACE) |
148         (1ULL << CAP_SYS_TTY_CONFIG) |
149         (1ULL << CAP_SYS_RESOURCE) |
150         (1ULL << CAP_SYS_BOOT) |
151         (1ULL << CAP_AUDIT_WRITE) |
152         (1ULL << CAP_AUDIT_CONTROL) |
153         (1ULL << CAP_MKNOD);
154 static char **arg_bind = NULL;
155 static char **arg_bind_ro = NULL;
156 static char **arg_tmpfs = NULL;
157 static char **arg_setenv = NULL;
158 static bool arg_quiet = false;
159 static bool arg_share_system = false;
160 static bool arg_register = true;
161 static bool arg_keep_unit = false;
162 static char **arg_network_interfaces = NULL;
163 static char **arg_network_macvlan = NULL;
164 static bool arg_network_veth = false;
165 static const char *arg_network_bridge = NULL;
166 static unsigned long arg_personality = 0xffffffffLU;
167 static const char *arg_image = NULL;
168 static Volatile arg_volatile = VOLATILE_NO;
169
170 static void help(void) {
171         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
172                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
173                "  -h --help                 Show this help\n"
174                "     --version              Print version string\n"
175                "  -q --quiet                Do not show status information\n"
176                "  -D --directory=PATH       Root directory for the container\n"
177                "  -i --image=PATH           File system device or image for the container\n"
178                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
179                "  -u --user=USER            Run the command under specified user or uid\n"
180                "  -M --machine=NAME         Set the machine name for the container\n"
181                "     --uuid=UUID            Set a specific machine UUID for the container\n"
182                "  -S --slice=SLICE          Place the container in the specified slice\n"
183                "     --private-network      Disable network in container\n"
184                "     --network-interface=INTERFACE\n"
185                "                            Assign an existing network interface to the\n"
186                "                            container\n"
187                "     --network-macvlan=INTERFACE\n"
188                "                            Create a macvlan network interface based on an\n"
189                "                            existing network interface to the container\n"
190                "     --network-veth         Add a virtual ethernet connection between host\n"
191                "                            and container\n"
192                "     --network-bridge=INTERFACE\n"
193                "                            Add a virtual ethernet connection between host\n"
194                "                            and container and add it to an existing bridge on\n"
195                "                            the host\n"
196                "  -Z --selinux-context=SECLABEL\n"
197                "                            Set the SELinux security context to be used by\n"
198                "                            processes in the container\n"
199                "  -L --selinux-apifs-context=SECLABEL\n"
200                "                            Set the SELinux security context to be used by\n"
201                "                            API/tmpfs file systems in the container\n"
202                "     --capability=CAP       In addition to the default, retain specified\n"
203                "                            capability\n"
204                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
205                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
206                "  -j                        Equivalent to --link-journal=host\n"
207                "     --read-only            Mount the root directory read-only\n"
208                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
209                "                            the container\n"
210                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
211                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
212                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
213                "     --share-system         Share system namespaces with host\n"
214                "     --register=BOOLEAN     Register container as machine\n"
215                "     --keep-unit            Do not register a scope for the machine, reuse\n"
216                "                            the service unit nspawn is running in\n"
217                "     --volatile[=MODE]      Run the system in volatile mode\n",
218                program_invocation_short_name);
219 }
220
221 static int parse_argv(int argc, char *argv[]) {
222
223         enum {
224                 ARG_VERSION = 0x100,
225                 ARG_PRIVATE_NETWORK,
226                 ARG_UUID,
227                 ARG_READ_ONLY,
228                 ARG_CAPABILITY,
229                 ARG_DROP_CAPABILITY,
230                 ARG_LINK_JOURNAL,
231                 ARG_BIND,
232                 ARG_BIND_RO,
233                 ARG_TMPFS,
234                 ARG_SETENV,
235                 ARG_SHARE_SYSTEM,
236                 ARG_REGISTER,
237                 ARG_KEEP_UNIT,
238                 ARG_NETWORK_INTERFACE,
239                 ARG_NETWORK_MACVLAN,
240                 ARG_NETWORK_VETH,
241                 ARG_NETWORK_BRIDGE,
242                 ARG_PERSONALITY,
243                 ARG_VOLATILE,
244         };
245
246         static const struct option options[] = {
247                 { "help",                  no_argument,       NULL, 'h'                   },
248                 { "version",               no_argument,       NULL, ARG_VERSION           },
249                 { "directory",             required_argument, NULL, 'D'                   },
250                 { "user",                  required_argument, NULL, 'u'                   },
251                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
252                 { "boot",                  no_argument,       NULL, 'b'                   },
253                 { "uuid",                  required_argument, NULL, ARG_UUID              },
254                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
255                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
256                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
257                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
258                 { "bind",                  required_argument, NULL, ARG_BIND              },
259                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
260                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
261                 { "machine",               required_argument, NULL, 'M'                   },
262                 { "slice",                 required_argument, NULL, 'S'                   },
263                 { "setenv",                required_argument, NULL, ARG_SETENV            },
264                 { "selinux-context",       required_argument, NULL, 'Z'                   },
265                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
266                 { "quiet",                 no_argument,       NULL, 'q'                   },
267                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
268                 { "register",              required_argument, NULL, ARG_REGISTER          },
269                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
270                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
271                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
272                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
273                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
274                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
275                 { "image",                 required_argument, NULL, 'i'                   },
276                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
277                 {}
278         };
279
280         int c, r;
281         uint64_t plus = 0, minus = 0;
282
283         assert(argc >= 0);
284         assert(argv);
285
286         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
287
288                 switch (c) {
289
290                 case 'h':
291                         help();
292                         return 0;
293
294                 case ARG_VERSION:
295                         puts(PACKAGE_STRING);
296                         puts(SYSTEMD_FEATURES);
297                         return 0;
298
299                 case 'D':
300                         free(arg_directory);
301                         arg_directory = canonicalize_file_name(optarg);
302                         if (!arg_directory) {
303                                 log_error("Invalid root directory: %m");
304                                 return -ENOMEM;
305                         }
306
307                         break;
308
309                 case 'i':
310                         arg_image = optarg;
311                         break;
312
313                 case 'u':
314                         free(arg_user);
315                         arg_user = strdup(optarg);
316                         if (!arg_user)
317                                 return log_oom();
318
319                         break;
320
321                 case ARG_NETWORK_BRIDGE:
322                         arg_network_bridge = optarg;
323
324                         /* fall through */
325
326                 case ARG_NETWORK_VETH:
327                         arg_network_veth = true;
328                         arg_private_network = true;
329                         break;
330
331                 case ARG_NETWORK_INTERFACE:
332                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
333                                 return log_oom();
334
335                         arg_private_network = true;
336                         break;
337
338                 case ARG_NETWORK_MACVLAN:
339                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
340                                 return log_oom();
341
342                         /* fall through */
343
344                 case ARG_PRIVATE_NETWORK:
345                         arg_private_network = true;
346                         break;
347
348                 case 'b':
349                         arg_boot = true;
350                         break;
351
352                 case ARG_UUID:
353                         r = sd_id128_from_string(optarg, &arg_uuid);
354                         if (r < 0) {
355                                 log_error("Invalid UUID: %s", optarg);
356                                 return r;
357                         }
358                         break;
359
360                 case 'S':
361                         arg_slice = optarg;
362                         break;
363
364                 case 'M':
365                         if (isempty(optarg)) {
366                                 free(arg_machine);
367                                 arg_machine = NULL;
368                         } else {
369
370                                 if (!hostname_is_valid(optarg)) {
371                                         log_error("Invalid machine name: %s", optarg);
372                                         return -EINVAL;
373                                 }
374
375                                 free(arg_machine);
376                                 arg_machine = strdup(optarg);
377                                 if (!arg_machine)
378                                         return log_oom();
379
380                                 break;
381                         }
382
383                 case 'Z':
384                         arg_selinux_context = optarg;
385                         break;
386
387                 case 'L':
388                         arg_selinux_apifs_context = optarg;
389                         break;
390
391                 case ARG_READ_ONLY:
392                         arg_read_only = true;
393                         break;
394
395                 case ARG_CAPABILITY:
396                 case ARG_DROP_CAPABILITY: {
397                         const char *state, *word;
398                         size_t length;
399
400                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
401                                 _cleanup_free_ char *t;
402                                 cap_value_t cap;
403
404                                 t = strndup(word, length);
405                                 if (!t)
406                                         return log_oom();
407
408                                 if (streq(t, "all")) {
409                                         if (c == ARG_CAPABILITY)
410                                                 plus = (uint64_t) -1;
411                                         else
412                                                 minus = (uint64_t) -1;
413                                 } else {
414                                         if (cap_from_name(t, &cap) < 0) {
415                                                 log_error("Failed to parse capability %s.", t);
416                                                 return -EINVAL;
417                                         }
418
419                                         if (c == ARG_CAPABILITY)
420                                                 plus |= 1ULL << (uint64_t) cap;
421                                         else
422                                                 minus |= 1ULL << (uint64_t) cap;
423                                 }
424                         }
425
426                         break;
427                 }
428
429                 case 'j':
430                         arg_link_journal = LINK_GUEST;
431                         break;
432
433                 case ARG_LINK_JOURNAL:
434                         if (streq(optarg, "auto"))
435                                 arg_link_journal = LINK_AUTO;
436                         else if (streq(optarg, "no"))
437                                 arg_link_journal = LINK_NO;
438                         else if (streq(optarg, "guest"))
439                                 arg_link_journal = LINK_GUEST;
440                         else if (streq(optarg, "host"))
441                                 arg_link_journal = LINK_HOST;
442                         else {
443                                 log_error("Failed to parse link journal mode %s", optarg);
444                                 return -EINVAL;
445                         }
446
447                         break;
448
449                 case ARG_BIND:
450                 case ARG_BIND_RO: {
451                         _cleanup_free_ char *a = NULL, *b = NULL;
452                         char *e;
453                         char ***x;
454
455                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
456
457                         e = strchr(optarg, ':');
458                         if (e) {
459                                 a = strndup(optarg, e - optarg);
460                                 b = strdup(e + 1);
461                         } else {
462                                 a = strdup(optarg);
463                                 b = strdup(optarg);
464                         }
465
466                         if (!a || !b)
467                                 return log_oom();
468
469                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
470                                 log_error("Invalid bind mount specification: %s", optarg);
471                                 return -EINVAL;
472                         }
473
474                         r = strv_extend(x, a);
475                         if (r < 0)
476                                 return log_oom();
477
478                         r = strv_extend(x, b);
479                         if (r < 0)
480                                 return log_oom();
481
482                         break;
483                 }
484
485                 case ARG_TMPFS: {
486                         _cleanup_free_ char *a = NULL, *b = NULL;
487                         char *e;
488
489                         e = strchr(optarg, ':');
490                         if (e) {
491                                 a = strndup(optarg, e - optarg);
492                                 b = strdup(e + 1);
493                         } else {
494                                 a = strdup(optarg);
495                                 b = strdup("mode=0755");
496                         }
497
498                         if (!a || !b)
499                                 return log_oom();
500
501                         if (!path_is_absolute(a)) {
502                                 log_error("Invalid tmpfs specification: %s", optarg);
503                                 return -EINVAL;
504                         }
505
506                         r = strv_push(&arg_tmpfs, a);
507                         if (r < 0)
508                                 return log_oom();
509
510                         a = NULL;
511
512                         r = strv_push(&arg_tmpfs, b);
513                         if (r < 0)
514                                 return log_oom();
515
516                         b = NULL;
517
518                         break;
519                 }
520
521                 case ARG_SETENV: {
522                         char **n;
523
524                         if (!env_assignment_is_valid(optarg)) {
525                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
526                                 return -EINVAL;
527                         }
528
529                         n = strv_env_set(arg_setenv, optarg);
530                         if (!n)
531                                 return log_oom();
532
533                         strv_free(arg_setenv);
534                         arg_setenv = n;
535                         break;
536                 }
537
538                 case 'q':
539                         arg_quiet = true;
540                         break;
541
542                 case ARG_SHARE_SYSTEM:
543                         arg_share_system = true;
544                         break;
545
546                 case ARG_REGISTER:
547                         r = parse_boolean(optarg);
548                         if (r < 0) {
549                                 log_error("Failed to parse --register= argument: %s", optarg);
550                                 return r;
551                         }
552
553                         arg_register = r;
554                         break;
555
556                 case ARG_KEEP_UNIT:
557                         arg_keep_unit = true;
558                         break;
559
560                 case ARG_PERSONALITY:
561
562                         arg_personality = personality_from_string(optarg);
563                         if (arg_personality == 0xffffffffLU) {
564                                 log_error("Unknown or unsupported personality '%s'.", optarg);
565                                 return -EINVAL;
566                         }
567
568                         break;
569
570                 case ARG_VOLATILE:
571
572                         if (!optarg)
573                                 arg_volatile = VOLATILE_YES;
574                         else {
575                                 r = parse_boolean(optarg);
576                                 if (r < 0) {
577                                         if (streq(optarg, "state"))
578                                                 arg_volatile = VOLATILE_STATE;
579                                         else {
580                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
581                                                 return r;
582                                         }
583                                 } else
584                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
585                         }
586
587                         break;
588
589                 case '?':
590                         return -EINVAL;
591
592                 default:
593                         assert_not_reached("Unhandled option");
594                 }
595
596         if (arg_share_system)
597                 arg_register = false;
598
599         if (arg_boot && arg_share_system) {
600                 log_error("--boot and --share-system may not be combined.");
601                 return -EINVAL;
602         }
603
604         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
605                 log_error("--keep-unit may not be used when invoked from a user session.");
606                 return -EINVAL;
607         }
608
609         if (arg_directory && arg_image) {
610                 log_error("--directory= and --image= may not be combined.");
611                 return -EINVAL;
612         }
613
614         if (arg_volatile != VOLATILE_NO && arg_read_only) {
615                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
616                 return -EINVAL;
617         }
618
619         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
620
621         return 1;
622 }
623
624 static int mount_all(const char *dest) {
625
626         typedef struct MountPoint {
627                 const char *what;
628                 const char *where;
629                 const char *type;
630                 const char *options;
631                 unsigned long flags;
632                 bool fatal;
633         } MountPoint;
634
635         static const MountPoint mount_table[] = {
636                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
637                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
638                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
639                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
640                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
641                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
642                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
643                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
644 #ifdef HAVE_SELINUX
645                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
646                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
647 #endif
648         };
649
650         unsigned k;
651         int r = 0;
652
653         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
654                 _cleanup_free_ char *where = NULL;
655 #ifdef HAVE_SELINUX
656                 _cleanup_free_ char *options = NULL;
657 #endif
658                 const char *o;
659                 int t;
660
661                 where = strjoin(dest, "/", mount_table[k].where, NULL);
662                 if (!where)
663                         return log_oom();
664
665                 t = path_is_mount_point(where, true);
666                 if (t < 0) {
667                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
668
669                         if (r == 0)
670                                 r = t;
671
672                         continue;
673                 }
674
675                 /* Skip this entry if it is not a remount. */
676                 if (mount_table[k].what && t > 0)
677                         continue;
678
679                 t = mkdir_p(where, 0755);
680                 if (t < 0) {
681                         if (mount_table[k].fatal) {
682                                log_error("Failed to create directory %s: %s", where, strerror(-t));
683
684                                 if (r == 0)
685                                         r = t;
686                         } else
687                                log_warning("Failed to create directory %s: %s", where, strerror(-t));
688
689                         continue;
690                 }
691
692 #ifdef HAVE_SELINUX
693                 if (arg_selinux_apifs_context &&
694                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
695                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
696                         if (!options)
697                                 return log_oom();
698
699                         o = options;
700                 } else
701 #endif
702                         o = mount_table[k].options;
703
704
705                 if (mount(mount_table[k].what,
706                           where,
707                           mount_table[k].type,
708                           mount_table[k].flags,
709                           o) < 0) {
710
711                         if (mount_table[k].fatal) {
712                                 log_error("mount(%s) failed: %m", where);
713
714                                 if (r == 0)
715                                         r = -errno;
716                         } else
717                                 log_warning("mount(%s) failed: %m", where);
718                 }
719         }
720
721         return r;
722 }
723
724 static int mount_binds(const char *dest, char **l, bool ro) {
725         char **x, **y;
726
727         STRV_FOREACH_PAIR(x, y, l) {
728                 _cleanup_free_ char *where = NULL;
729                 struct stat source_st, dest_st;
730                 int r;
731
732                 if (stat(*x, &source_st) < 0) {
733                         log_error("Failed to stat %s: %m", *x);
734                         return -errno;
735                 }
736
737                 where = strappend(dest, *y);
738                 if (!where)
739                         return log_oom();
740
741                 r = stat(where, &dest_st);
742                 if (r == 0) {
743                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
744                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
745                                 return -EINVAL;
746                         }
747                 } else if (errno == ENOENT) {
748                         r = mkdir_parents_label(where, 0755);
749                         if (r < 0) {
750                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
751                                 return r;
752                         }
753                 } else {
754                         log_error("Failed to bind mount %s: %m", *x);
755                         return -errno;
756                 }
757
758                 /* Create the mount point, but be conservative -- refuse to create block
759                  * and char devices. */
760                 if (S_ISDIR(source_st.st_mode)) {
761                         r = mkdir_label(where, 0755);
762                         if (r < 0 && errno != EEXIST) {
763                                 log_error("Failed to create mount point %s: %s", where, strerror(-r));
764
765                                 return r;
766                         }
767                 } else if (S_ISFIFO(source_st.st_mode)) {
768                         r = mkfifo(where, 0644);
769                         if (r < 0 && errno != EEXIST) {
770                                 log_error("Failed to create mount point %s: %m", where);
771
772                                 return -errno;
773                         }
774                 } else if (S_ISSOCK(source_st.st_mode)) {
775                         r = mknod(where, 0644 | S_IFSOCK, 0);
776                         if (r < 0 && errno != EEXIST) {
777                                 log_error("Failed to create mount point %s: %m", where);
778
779                                 return -errno;
780                         }
781                 } else if (S_ISREG(source_st.st_mode)) {
782                         r = touch(where);
783                         if (r < 0) {
784                                 log_error("Failed to create mount point %s: %s", where, strerror(-r));
785
786                                 return r;
787                         }
788                 } else {
789                         log_error("Refusing to create mountpoint for file: %s", *x);
790                         return -ENOTSUP;
791                 }
792
793                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
794                         log_error("mount(%s) failed: %m", where);
795                         return -errno;
796                 }
797
798                 if (ro) {
799                         r = bind_remount_recursive(where, true);
800                         if (r < 0) {
801                                 log_error("Read-Only bind mount failed: %s", strerror(-r));
802                                 return r;
803                         }
804                 }
805         }
806
807         return 0;
808 }
809
810 static int mount_tmpfs(const char *dest) {
811         char **i, **o;
812
813         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
814                 _cleanup_free_ char *where = NULL;
815                 int r;
816
817                 where = strappend(dest, *i);
818                 if (!where)
819                         return log_oom();
820
821                 r = mkdir_label(where, 0755);
822                 if (r < 0) {
823                         log_error("creating mount point for tmpfs %s failed: %s", where, strerror(-r));
824
825                         return r;
826                 }
827
828                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
829                         log_error("tmpfs mount to %s failed: %m", where);
830                         return -errno;
831                 }
832         }
833
834         return 0;
835 }
836
837 static int setup_timezone(const char *dest) {
838         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
839         char *z, *y;
840         int r;
841
842         assert(dest);
843
844         /* Fix the timezone, if possible */
845         r = readlink_malloc("/etc/localtime", &p);
846         if (r < 0) {
847                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
848                 return 0;
849         }
850
851         z = path_startswith(p, "../usr/share/zoneinfo/");
852         if (!z)
853                 z = path_startswith(p, "/usr/share/zoneinfo/");
854         if (!z) {
855                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
856                 return 0;
857         }
858
859         where = strappend(dest, "/etc/localtime");
860         if (!where)
861                 return log_oom();
862
863         r = readlink_malloc(where, &q);
864         if (r >= 0) {
865                 y = path_startswith(q, "../usr/share/zoneinfo/");
866                 if (!y)
867                         y = path_startswith(q, "/usr/share/zoneinfo/");
868
869                 /* Already pointing to the right place? Then do nothing .. */
870                 if (y && streq(y, z))
871                         return 0;
872         }
873
874         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
875         if (!check)
876                 return log_oom();
877
878         if (access(check, F_OK) < 0) {
879                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
880                 return 0;
881         }
882
883         what = strappend("../usr/share/zoneinfo/", z);
884         if (!what)
885                 return log_oom();
886
887         r = mkdir_parents(where, 0755);
888         if (r < 0) {
889                 log_error("Failed to create directory for timezone info %s in container: %s", where, strerror(-r));
890
891                 return 0;
892         }
893
894         r = unlink(where);
895         if (r < 0 && errno != ENOENT) {
896                 log_error("Failed to remove existing timezone info %s in container: %m", where);
897
898                 return 0;
899         }
900
901         if (symlink(what, where) < 0) {
902                 log_error("Failed to correct timezone of container: %m");
903                 return 0;
904         }
905
906         return 0;
907 }
908
909 static int setup_resolv_conf(const char *dest) {
910         _cleanup_free_ char *where = NULL;
911         int r;
912
913         assert(dest);
914
915         if (arg_private_network)
916                 return 0;
917
918         /* Fix resolv.conf, if possible */
919         where = strappend(dest, "/etc/resolv.conf");
920         if (!where)
921                 return log_oom();
922
923         /* We don't really care for the results of this really. If it
924          * fails, it fails, but meh... */
925         r = mkdir_parents(where, 0755);
926         if (r < 0) {
927                 log_warning("Failed to create parent directory for resolv.conf %s: %s", where, strerror(-r));
928
929                 return 0;
930         }
931
932         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
933         if (r < 0) {
934                 log_warning("Failed to copy /etc/resolv.conf to %s: %s", where, strerror(-r));
935
936                 return 0;
937         }
938
939         return 0;
940 }
941
942 static int setup_volatile_state(const char *directory) {
943         const char *p;
944         int r;
945
946         assert(directory);
947
948         if (arg_volatile != VOLATILE_STATE)
949                 return 0;
950
951         /* --volatile=state means we simply overmount /var
952            with a tmpfs, and the rest read-only. */
953
954         r = bind_remount_recursive(directory, true);
955         if (r < 0) {
956                 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
957                 return r;
958         }
959
960         p = strappenda(directory, "/var");
961         r = mkdir(p, 0755);
962         if (r < 0 && errno != EEXIST) {
963                 log_error("Failed to create %s: %m", directory);
964                 return -errno;
965         }
966
967         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
968                 log_error("Failed to mount tmpfs to /var: %m");
969                 return -errno;
970         }
971
972         return 0;
973 }
974
975 static int setup_volatile(const char *directory) {
976         bool tmpfs_mounted = false, bind_mounted = false;
977         char template[] = "/tmp/nspawn-volatile-XXXXXX";
978         const char *f, *t;
979         int r;
980
981         assert(directory);
982
983         if (arg_volatile != VOLATILE_YES)
984                 return 0;
985
986         /* --volatile=yes means we mount a tmpfs to the root dir, and
987            the original /usr to use inside it, and that read-only. */
988
989         if (!mkdtemp(template)) {
990                 log_error("Failed to create temporary directory: %m");
991                 return -errno;
992         }
993
994         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
995                 log_error("Failed to mount tmpfs for root directory: %m");
996                 r = -errno;
997                 goto fail;
998         }
999
1000         tmpfs_mounted = true;
1001
1002         f = strappenda(directory, "/usr");
1003         t = strappenda(template, "/usr");
1004
1005         r = mkdir(t, 0755);
1006         if (r < 0 && errno != EEXIST) {
1007                 log_error("Failed to create %s: %m", t);
1008                 r = -errno;
1009                 goto fail;
1010         }
1011
1012         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1013                 log_error("Failed to create /usr bind mount: %m");
1014                 r = -errno;
1015                 goto fail;
1016         }
1017
1018         bind_mounted = true;
1019
1020         r = bind_remount_recursive(t, true);
1021         if (r < 0) {
1022                 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
1023                 goto fail;
1024         }
1025
1026         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1027                 log_error("Failed to move root mount: %m");
1028                 r = -errno;
1029                 goto fail;
1030         }
1031
1032         rmdir(template);
1033
1034         return 0;
1035
1036 fail:
1037         if (bind_mounted)
1038                 umount(t);
1039         if (tmpfs_mounted)
1040                 umount(template);
1041         rmdir(template);
1042         return r;
1043 }
1044
1045 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1046
1047         snprintf(s, 37,
1048                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1049                  SD_ID128_FORMAT_VAL(id));
1050
1051         return s;
1052 }
1053
1054 static int setup_boot_id(const char *dest) {
1055         _cleanup_free_ char *from = NULL, *to = NULL;
1056         sd_id128_t rnd = {};
1057         char as_uuid[37];
1058         int r;
1059
1060         assert(dest);
1061
1062         if (arg_share_system)
1063                 return 0;
1064
1065         /* Generate a new randomized boot ID, so that each boot-up of
1066          * the container gets a new one */
1067
1068         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1069         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1070         if (!from || !to)
1071                 return log_oom();
1072
1073         r = sd_id128_randomize(&rnd);
1074         if (r < 0) {
1075                 log_error("Failed to generate random boot id: %s", strerror(-r));
1076                 return r;
1077         }
1078
1079         id128_format_as_uuid(rnd, as_uuid);
1080
1081         r = write_string_file(from, as_uuid);
1082         if (r < 0) {
1083                 log_error("Failed to write boot id: %s", strerror(-r));
1084                 return r;
1085         }
1086
1087         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1088                 log_error("Failed to bind mount boot id: %m");
1089                 r = -errno;
1090         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1091                 log_warning("Failed to make boot id read-only: %m");
1092
1093         unlink(from);
1094         return r;
1095 }
1096
1097 static int copy_devnodes(const char *dest) {
1098
1099         static const char devnodes[] =
1100                 "null\0"
1101                 "zero\0"
1102                 "full\0"
1103                 "random\0"
1104                 "urandom\0"
1105                 "tty\0"
1106                 "net/tun\0";
1107
1108         const char *d;
1109         int r = 0;
1110         _cleanup_umask_ mode_t u;
1111
1112         assert(dest);
1113
1114         u = umask(0000);
1115
1116         NULSTR_FOREACH(d, devnodes) {
1117                 _cleanup_free_ char *from = NULL, *to = NULL;
1118                 struct stat st;
1119
1120                 from = strappend("/dev/", d);
1121                 to = strjoin(dest, "/dev/", d, NULL);
1122                 if (!from || !to)
1123                         return log_oom();
1124
1125                 if (stat(from, &st) < 0) {
1126
1127                         if (errno != ENOENT) {
1128                                 log_error("Failed to stat %s: %m", from);
1129                                 return -errno;
1130                         }
1131
1132                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1133
1134                         log_error("%s is not a char or block device, cannot copy", from);
1135                         return -EIO;
1136
1137                 } else {
1138                         r = mkdir_parents(to, 0775);
1139                         if (r < 0) {
1140                                 log_error("Failed to create parent directory of %s: %s", to, strerror(-r));
1141                                 return -r;
1142                         }
1143
1144                         if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1145                                 log_error("mknod(%s) failed: %m", dest);
1146                                 return  -errno;
1147                         }
1148                 }
1149         }
1150
1151         return r;
1152 }
1153
1154 static int setup_ptmx(const char *dest) {
1155         _cleanup_free_ char *p = NULL;
1156
1157         p = strappend(dest, "/dev/ptmx");
1158         if (!p)
1159                 return log_oom();
1160
1161         if (symlink("pts/ptmx", p) < 0) {
1162                 log_error("Failed to create /dev/ptmx symlink: %m");
1163                 return -errno;
1164         }
1165
1166         return 0;
1167 }
1168
1169 static int setup_dev_console(const char *dest, const char *console) {
1170         _cleanup_umask_ mode_t u;
1171         const char *to;
1172         struct stat st;
1173         int r;
1174
1175         assert(dest);
1176         assert(console);
1177
1178         u = umask(0000);
1179
1180         if (stat("/dev/null", &st) < 0) {
1181                 log_error("Failed to stat /dev/null: %m");
1182                 return -errno;
1183         }
1184
1185         r = chmod_and_chown(console, 0600, 0, 0);
1186         if (r < 0) {
1187                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
1188                 return r;
1189         }
1190
1191         /* We need to bind mount the right tty to /dev/console since
1192          * ptys can only exist on pts file systems. To have something
1193          * to bind mount things on we create a device node first, and
1194          * use /dev/null for that since we the cgroups device policy
1195          * allows us to create that freely, while we cannot create
1196          * /dev/console. (Note that the major minor doesn't actually
1197          * matter here, since we mount it over anyway). */
1198
1199         to = strappenda(dest, "/dev/console");
1200         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1201                 log_error("mknod() for /dev/console failed: %m");
1202                 return -errno;
1203         }
1204
1205         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1206                 log_error("Bind mount for /dev/console failed: %m");
1207                 return -errno;
1208         }
1209
1210         return 0;
1211 }
1212
1213 static int setup_kmsg(const char *dest, int kmsg_socket) {
1214         _cleanup_free_ char *from = NULL, *to = NULL;
1215         int r, fd, k;
1216         _cleanup_umask_ mode_t u;
1217         union {
1218                 struct cmsghdr cmsghdr;
1219                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1220         } control = {};
1221         struct msghdr mh = {
1222                 .msg_control = &control,
1223                 .msg_controllen = sizeof(control),
1224         };
1225         struct cmsghdr *cmsg;
1226
1227         assert(dest);
1228         assert(kmsg_socket >= 0);
1229
1230         u = umask(0000);
1231
1232         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1233          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1234          * on the reading side behave very similar to /proc/kmsg,
1235          * their writing side behaves differently from /dev/kmsg in
1236          * that writing blocks when nothing is reading. In order to
1237          * avoid any problems with containers deadlocking due to this
1238          * we simply make /dev/kmsg unavailable to the container. */
1239         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1240             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1241                 return log_oom();
1242
1243         if (mkfifo(from, 0600) < 0) {
1244                 log_error("mkfifo() for /dev/kmsg failed: %m");
1245                 return -errno;
1246         }
1247
1248         r = chmod_and_chown(from, 0600, 0, 0);
1249         if (r < 0) {
1250                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1251                 return r;
1252         }
1253
1254         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1255                 log_error("Bind mount for /proc/kmsg failed: %m");
1256                 return -errno;
1257         }
1258
1259         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1260         if (fd < 0) {
1261                 log_error("Failed to open fifo: %m");
1262                 return -errno;
1263         }
1264
1265         cmsg = CMSG_FIRSTHDR(&mh);
1266         cmsg->cmsg_level = SOL_SOCKET;
1267         cmsg->cmsg_type = SCM_RIGHTS;
1268         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1269         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1270
1271         mh.msg_controllen = cmsg->cmsg_len;
1272
1273         /* Store away the fd in the socket, so that it stays open as
1274          * long as we run the child */
1275         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1276         safe_close(fd);
1277
1278         if (k < 0) {
1279                 log_error("Failed to send FIFO fd: %m");
1280                 return -errno;
1281         }
1282
1283         /* And now make the FIFO unavailable as /dev/kmsg... */
1284         unlink(from);
1285         return 0;
1286 }
1287
1288 static int setup_hostname(void) {
1289
1290         if (arg_share_system)
1291                 return 0;
1292
1293         if (sethostname_idempotent(arg_machine) < 0)
1294                 return -errno;
1295
1296         return 0;
1297 }
1298
1299 static int setup_journal(const char *directory) {
1300         sd_id128_t machine_id, this_id;
1301         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1302         char *id;
1303         int r;
1304
1305         p = strappend(directory, "/etc/machine-id");
1306         if (!p)
1307                 return log_oom();
1308
1309         r = read_one_line_file(p, &b);
1310         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1311                 return 0;
1312         else if (r < 0) {
1313                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1314                 return r;
1315         }
1316
1317         id = strstrip(b);
1318         if (isempty(id) && arg_link_journal == LINK_AUTO)
1319                 return 0;
1320
1321         /* Verify validity */
1322         r = sd_id128_from_string(id, &machine_id);
1323         if (r < 0) {
1324                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1325                 return r;
1326         }
1327
1328         r = sd_id128_get_machine(&this_id);
1329         if (r < 0) {
1330                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1331                 return r;
1332         }
1333
1334         if (sd_id128_equal(machine_id, this_id)) {
1335                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1336                          "Host and machine ids are equal (%s): refusing to link journals", id);
1337                 if (arg_link_journal == LINK_AUTO)
1338                         return 0;
1339                 return
1340                         -EEXIST;
1341         }
1342
1343         if (arg_link_journal == LINK_NO)
1344                 return 0;
1345
1346         free(p);
1347         p = strappend("/var/log/journal/", id);
1348         q = strjoin(directory, "/var/log/journal/", id, NULL);
1349         if (!p || !q)
1350                 return log_oom();
1351
1352         if (path_is_mount_point(p, false) > 0) {
1353                 if (arg_link_journal != LINK_AUTO) {
1354                         log_error("%s: already a mount point, refusing to use for journal", p);
1355                         return -EEXIST;
1356                 }
1357
1358                 return 0;
1359         }
1360
1361         if (path_is_mount_point(q, false) > 0) {
1362                 if (arg_link_journal != LINK_AUTO) {
1363                         log_error("%s: already a mount point, refusing to use for journal", q);
1364                         return -EEXIST;
1365                 }
1366
1367                 return 0;
1368         }
1369
1370         r = readlink_and_make_absolute(p, &d);
1371         if (r >= 0) {
1372                 if ((arg_link_journal == LINK_GUEST ||
1373                      arg_link_journal == LINK_AUTO) &&
1374                     path_equal(d, q)) {
1375
1376                         r = mkdir_p(q, 0755);
1377                         if (r < 0)
1378                                 log_warning("Failed to create directory %s: %m", q);
1379                         return 0;
1380                 }
1381
1382                 if (unlink(p) < 0) {
1383                         log_error("Failed to remove symlink %s: %m", p);
1384                         return -errno;
1385                 }
1386         } else if (r == -EINVAL) {
1387
1388                 if (arg_link_journal == LINK_GUEST &&
1389                     rmdir(p) < 0) {
1390
1391                         if (errno == ENOTDIR) {
1392                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1393                                 return r;
1394                         } else {
1395                                 log_error("Failed to remove %s: %m", p);
1396                                 return -errno;
1397                         }
1398                 }
1399         } else if (r != -ENOENT) {
1400                 log_error("readlink(%s) failed: %m", p);
1401                 return r;
1402         }
1403
1404         if (arg_link_journal == LINK_GUEST) {
1405
1406                 if (symlink(q, p) < 0) {
1407                         log_error("Failed to symlink %s to %s: %m", q, p);
1408                         return -errno;
1409                 }
1410
1411                 r = mkdir_p(q, 0755);
1412                 if (r < 0)
1413                         log_warning("Failed to create directory %s: %m", q);
1414                 return 0;
1415         }
1416
1417         if (arg_link_journal == LINK_HOST) {
1418                 r = mkdir_p(p, 0755);
1419                 if (r < 0) {
1420                         log_error("Failed to create %s: %m", p);
1421                         return r;
1422                 }
1423
1424         } else if (access(p, F_OK) < 0)
1425                 return 0;
1426
1427         if (dir_is_empty(q) == 0)
1428                 log_warning("%s is not empty, proceeding anyway.", q);
1429
1430         r = mkdir_p(q, 0755);
1431         if (r < 0) {
1432                 log_error("Failed to create %s: %m", q);
1433                 return r;
1434         }
1435
1436         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1437                 log_error("Failed to bind mount journal from host into guest: %m");
1438                 return -errno;
1439         }
1440
1441         return 0;
1442 }
1443
1444 static int setup_kdbus(const char *dest, const char *path) {
1445         const char *p;
1446
1447         if (!path)
1448                 return 0;
1449
1450         p = strappenda(dest, "/dev/kdbus");
1451         if (mkdir(p, 0755) < 0) {
1452                 log_error("Failed to create kdbus path: %m");
1453                 return  -errno;
1454         }
1455
1456         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1457                 log_error("Failed to mount kdbus domain path: %m");
1458                 return -errno;
1459         }
1460
1461         return 0;
1462 }
1463
1464 static int drop_capabilities(void) {
1465         return capability_bounding_set_drop(~arg_retain, false);
1466 }
1467
1468 static int register_machine(pid_t pid, int local_ifindex) {
1469         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1470         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1471         int r;
1472
1473         if (!arg_register)
1474                 return 0;
1475
1476         r = sd_bus_default_system(&bus);
1477         if (r < 0) {
1478                 log_error("Failed to open system bus: %s", strerror(-r));
1479                 return r;
1480         }
1481
1482         if (arg_keep_unit) {
1483                 r = sd_bus_call_method(
1484                                 bus,
1485                                 "org.freedesktop.machine1",
1486                                 "/org/freedesktop/machine1",
1487                                 "org.freedesktop.machine1.Manager",
1488                                 "RegisterMachineWithNetwork",
1489                                 &error,
1490                                 NULL,
1491                                 "sayssusai",
1492                                 arg_machine,
1493                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1494                                 "nspawn",
1495                                 "container",
1496                                 (uint32_t) pid,
1497                                 strempty(arg_directory),
1498                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1499         } else {
1500                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1501
1502                 r = sd_bus_message_new_method_call(
1503                                 bus,
1504                                 &m,
1505                                 "org.freedesktop.machine1",
1506                                 "/org/freedesktop/machine1",
1507                                 "org.freedesktop.machine1.Manager",
1508                                 "CreateMachineWithNetwork");
1509                 if (r < 0) {
1510                         log_error("Failed to create message: %s", strerror(-r));
1511                         return r;
1512                 }
1513
1514                 r = sd_bus_message_append(
1515                                 m,
1516                                 "sayssusai",
1517                                 arg_machine,
1518                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1519                                 "nspawn",
1520                                 "container",
1521                                 (uint32_t) pid,
1522                                 strempty(arg_directory),
1523                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1524                 if (r < 0) {
1525                         log_error("Failed to append message arguments: %s", strerror(-r));
1526                         return r;
1527                 }
1528
1529                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1530                 if (r < 0) {
1531                         log_error("Failed to open container: %s", strerror(-r));
1532                         return r;
1533                 }
1534
1535                 if (!isempty(arg_slice)) {
1536                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1537                         if (r < 0) {
1538                                 log_error("Failed to append slice: %s", strerror(-r));
1539                                 return r;
1540                         }
1541                 }
1542
1543                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1544                 if (r < 0) {
1545                         log_error("Failed to add device policy: %s", strerror(-r));
1546                         return r;
1547                 }
1548
1549                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 11,
1550                                           /* Allow the container to
1551                                            * access and create the API
1552                                            * device nodes, so that
1553                                            * PrivateDevices= in the
1554                                            * container can work
1555                                            * fine */
1556                                           "/dev/null", "rwm",
1557                                           "/dev/zero", "rwm",
1558                                           "/dev/full", "rwm",
1559                                           "/dev/random", "rwm",
1560                                           "/dev/urandom", "rwm",
1561                                           "/dev/tty", "rwm",
1562                                           "/dev/net/tun", "rwm",
1563                                           /* Allow the container
1564                                            * access to ptys. However,
1565                                            * do not permit the
1566                                            * container to ever create
1567                                            * these device nodes. */
1568                                           "/dev/pts/ptmx", "rw",
1569                                           "char-pts", "rw",
1570                                           /* Allow the container
1571                                            * access to all kdbus
1572                                            * devices. Again, the
1573                                            * container cannot create
1574                                            * these nodes, only use
1575                                            * them. We use a pretty
1576                                            * open match here, so that
1577                                            * the kernel API can still
1578                                            * change. */
1579                                           "char-kdbus", "rw",
1580                                           "char-kdbus/*", "rw");
1581                 if (r < 0) {
1582                         log_error("Failed to add device whitelist: %s", strerror(-r));
1583                         return r;
1584                 }
1585
1586                 r = sd_bus_message_close_container(m);
1587                 if (r < 0) {
1588                         log_error("Failed to close container: %s", strerror(-r));
1589                         return r;
1590                 }
1591
1592                 r = sd_bus_call(bus, m, 0, &error, NULL);
1593         }
1594
1595         if (r < 0) {
1596                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1597                 return r;
1598         }
1599
1600         return 0;
1601 }
1602
1603 static int terminate_machine(pid_t pid) {
1604         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1605         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1606         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1607         const char *path;
1608         int r;
1609
1610         if (!arg_register)
1611                 return 0;
1612
1613         r = sd_bus_default_system(&bus);
1614         if (r < 0) {
1615                 log_error("Failed to open system bus: %s", strerror(-r));
1616                 return r;
1617         }
1618
1619         r = sd_bus_call_method(
1620                         bus,
1621                         "org.freedesktop.machine1",
1622                         "/org/freedesktop/machine1",
1623                         "org.freedesktop.machine1.Manager",
1624                         "GetMachineByPID",
1625                         &error,
1626                         &reply,
1627                         "u",
1628                         (uint32_t) pid);
1629         if (r < 0) {
1630                 /* Note that the machine might already have been
1631                  * cleaned up automatically, hence don't consider it a
1632                  * failure if we cannot get the machine object. */
1633                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1634                 return 0;
1635         }
1636
1637         r = sd_bus_message_read(reply, "o", &path);
1638         if (r < 0)
1639                 return bus_log_parse_error(r);
1640
1641         r = sd_bus_call_method(
1642                         bus,
1643                         "org.freedesktop.machine1",
1644                         path,
1645                         "org.freedesktop.machine1.Machine",
1646                         "Terminate",
1647                         &error,
1648                         NULL,
1649                         NULL);
1650         if (r < 0) {
1651                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1652                 return 0;
1653         }
1654
1655         return 0;
1656 }
1657
1658 static int reset_audit_loginuid(void) {
1659         _cleanup_free_ char *p = NULL;
1660         int r;
1661
1662         if (arg_share_system)
1663                 return 0;
1664
1665         r = read_one_line_file("/proc/self/loginuid", &p);
1666         if (r == -ENOENT)
1667                 return 0;
1668         if (r < 0) {
1669                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1670                 return r;
1671         }
1672
1673         /* Already reset? */
1674         if (streq(p, "4294967295"))
1675                 return 0;
1676
1677         r = write_string_file("/proc/self/loginuid", "4294967295");
1678         if (r < 0) {
1679                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1680                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1681                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1682                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1683                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1684
1685                 sleep(5);
1686         }
1687
1688         return 0;
1689 }
1690
1691 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1692 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1693
1694 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
1695         int r;
1696
1697         uint8_t result[8];
1698         size_t l, sz;
1699         uint8_t *v;
1700
1701         l = strlen(arg_machine);
1702         sz = sizeof(sd_id128_t) + l;
1703         v = alloca(sz);
1704
1705         /* fetch some persistent data unique to the host */
1706         r = sd_id128_get_machine((sd_id128_t*) v);
1707         if (r < 0)
1708                 return r;
1709
1710         /* combine with some data unique (on this host) to this
1711          * container instance */
1712         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1713
1714         /* Let's hash the host machine ID plus the container name. We
1715          * use a fixed, but originally randomly created hash key here. */
1716         siphash24(result, v, sz, hash_key.bytes);
1717
1718         assert_cc(ETH_ALEN <= sizeof(result));
1719         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1720
1721         /* see eth_random_addr in the kernel */
1722         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1723         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1724
1725         return 0;
1726 }
1727
1728 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1729         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1730         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1731         struct ether_addr mac_host, mac_container;
1732         int r, i;
1733
1734         if (!arg_private_network)
1735                 return 0;
1736
1737         if (!arg_network_veth)
1738                 return 0;
1739
1740         /* Use two different interface name prefixes depending whether
1741          * we are in bridge mode or not. */
1742         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1743                  arg_network_bridge ? "vb" : "ve", arg_machine);
1744
1745         r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
1746         if (r < 0) {
1747                 log_error("Failed to generate predictable MAC address for container side");
1748                 return r;
1749         }
1750
1751         r = generate_mac(&mac_host, HOST_HASH_KEY);
1752         if (r < 0) {
1753                 log_error("Failed to generate predictable MAC address for host side");
1754                 return r;
1755         }
1756
1757         r = sd_rtnl_open(&rtnl, 0);
1758         if (r < 0) {
1759                 log_error("Failed to connect to netlink: %s", strerror(-r));
1760                 return r;
1761         }
1762
1763         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1764         if (r < 0) {
1765                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1766                 return r;
1767         }
1768
1769         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1770         if (r < 0) {
1771                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1772                 return r;
1773         }
1774
1775         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1776         if (r < 0) {
1777                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1778                 return r;
1779         }
1780
1781         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1782         if (r < 0) {
1783                 log_error("Failed to open netlink container: %s", strerror(-r));
1784                 return r;
1785         }
1786
1787         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1788         if (r < 0) {
1789                 log_error("Failed to open netlink container: %s", strerror(-r));
1790                 return r;
1791         }
1792
1793         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1794         if (r < 0) {
1795                 log_error("Failed to open netlink container: %s", strerror(-r));
1796                 return r;
1797         }
1798
1799         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1800         if (r < 0) {
1801                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1802                 return r;
1803         }
1804
1805         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1806         if (r < 0) {
1807                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1808                 return r;
1809         }
1810
1811         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1812         if (r < 0) {
1813                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1814                 return r;
1815         }
1816
1817         r = sd_rtnl_message_close_container(m);
1818         if (r < 0) {
1819                 log_error("Failed to close netlink container: %s", strerror(-r));
1820                 return r;
1821         }
1822
1823         r = sd_rtnl_message_close_container(m);
1824         if (r < 0) {
1825                 log_error("Failed to close netlink container: %s", strerror(-r));
1826                 return r;
1827         }
1828
1829         r = sd_rtnl_message_close_container(m);
1830         if (r < 0) {
1831                 log_error("Failed to close netlink container: %s", strerror(-r));
1832                 return r;
1833         }
1834
1835         r = sd_rtnl_call(rtnl, m, 0, NULL);
1836         if (r < 0) {
1837                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1838                 return r;
1839         }
1840
1841         i = (int) if_nametoindex(iface_name);
1842         if (i <= 0) {
1843                 log_error("Failed to resolve interface %s: %m", iface_name);
1844                 return -errno;
1845         }
1846
1847         *ifi = i;
1848
1849         return 0;
1850 }
1851
1852 static int setup_bridge(const char veth_name[], int *ifi) {
1853         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1854         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1855         int r, bridge;
1856
1857         if (!arg_private_network)
1858                 return 0;
1859
1860         if (!arg_network_veth)
1861                 return 0;
1862
1863         if (!arg_network_bridge)
1864                 return 0;
1865
1866         bridge = (int) if_nametoindex(arg_network_bridge);
1867         if (bridge <= 0) {
1868                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1869                 return -errno;
1870         }
1871
1872         *ifi = bridge;
1873
1874         r = sd_rtnl_open(&rtnl, 0);
1875         if (r < 0) {
1876                 log_error("Failed to connect to netlink: %s", strerror(-r));
1877                 return r;
1878         }
1879
1880         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1881         if (r < 0) {
1882                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1883                 return r;
1884         }
1885
1886         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1887         if (r < 0) {
1888                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1889                 return r;
1890         }
1891
1892         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1893         if (r < 0) {
1894                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1895                 return r;
1896         }
1897
1898         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1899         if (r < 0) {
1900                 log_error("Failed to add netlink master field: %s", strerror(-r));
1901                 return r;
1902         }
1903
1904         r = sd_rtnl_call(rtnl, m, 0, NULL);
1905         if (r < 0) {
1906                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1907                 return r;
1908         }
1909
1910         return 0;
1911 }
1912
1913 static int parse_interface(struct udev *udev, const char *name) {
1914         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1915         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1916         int ifi;
1917
1918         ifi = (int) if_nametoindex(name);
1919         if (ifi <= 0) {
1920                 log_error("Failed to resolve interface %s: %m", name);
1921                 return -errno;
1922         }
1923
1924         sprintf(ifi_str, "n%i", ifi);
1925         d = udev_device_new_from_device_id(udev, ifi_str);
1926         if (!d) {
1927                 log_error("Failed to get udev device for interface %s: %m", name);
1928                 return -errno;
1929         }
1930
1931         if (udev_device_get_is_initialized(d) <= 0) {
1932                 log_error("Network interface %s is not initialized yet.", name);
1933                 return -EBUSY;
1934         }
1935
1936         return ifi;
1937 }
1938
1939 static int move_network_interfaces(pid_t pid) {
1940         _cleanup_udev_unref_ struct udev *udev = NULL;
1941         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1942         char **i;
1943         int r;
1944
1945         if (!arg_private_network)
1946                 return 0;
1947
1948         if (strv_isempty(arg_network_interfaces))
1949                 return 0;
1950
1951         r = sd_rtnl_open(&rtnl, 0);
1952         if (r < 0) {
1953                 log_error("Failed to connect to netlink: %s", strerror(-r));
1954                 return r;
1955         }
1956
1957         udev = udev_new();
1958         if (!udev) {
1959                 log_error("Failed to connect to udev.");
1960                 return -ENOMEM;
1961         }
1962
1963         STRV_FOREACH(i, arg_network_interfaces) {
1964                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1965                 int ifi;
1966
1967                 ifi = parse_interface(udev, *i);
1968                 if (ifi < 0)
1969                         return ifi;
1970
1971                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1972                 if (r < 0) {
1973                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1974                         return r;
1975                 }
1976
1977                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1978                 if (r < 0) {
1979                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1980                         return r;
1981                 }
1982
1983                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1984                 if (r < 0) {
1985                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1986                         return r;
1987                 }
1988         }
1989
1990         return 0;
1991 }
1992
1993 static int setup_macvlan(pid_t pid) {
1994         _cleanup_udev_unref_ struct udev *udev = NULL;
1995         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1996         char **i;
1997         int r;
1998
1999         if (!arg_private_network)
2000                 return 0;
2001
2002         if (strv_isempty(arg_network_macvlan))
2003                 return 0;
2004
2005         r = sd_rtnl_open(&rtnl, 0);
2006         if (r < 0) {
2007                 log_error("Failed to connect to netlink: %s", strerror(-r));
2008                 return r;
2009         }
2010
2011         udev = udev_new();
2012         if (!udev) {
2013                 log_error("Failed to connect to udev.");
2014                 return -ENOMEM;
2015         }
2016
2017         STRV_FOREACH(i, arg_network_macvlan) {
2018                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
2019                 _cleanup_free_ char *n = NULL;
2020                 int ifi;
2021
2022                 ifi = parse_interface(udev, *i);
2023                 if (ifi < 0)
2024                         return ifi;
2025
2026                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
2027                 if (r < 0) {
2028                         log_error("Failed to allocate netlink message: %s", strerror(-r));
2029                         return r;
2030                 }
2031
2032                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
2033                 if (r < 0) {
2034                         log_error("Failed to add netlink interface index: %s", strerror(-r));
2035                         return r;
2036                 }
2037
2038                 n = strappend("mv-", *i);
2039                 if (!n)
2040                         return log_oom();
2041
2042                 strshorten(n, IFNAMSIZ-1);
2043
2044                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
2045                 if (r < 0) {
2046                         log_error("Failed to add netlink interface name: %s", strerror(-r));
2047                         return r;
2048                 }
2049
2050                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
2051                 if (r < 0) {
2052                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
2053                         return r;
2054                 }
2055
2056                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
2057                 if (r < 0) {
2058                         log_error("Failed to open netlink container: %s", strerror(-r));
2059                         return r;
2060                 }
2061
2062                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
2063                 if (r < 0) {
2064                         log_error("Failed to open netlink container: %s", strerror(-r));
2065                         return r;
2066                 }
2067
2068                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
2069                 if (r < 0) {
2070                         log_error("Failed to append macvlan mode: %s", strerror(-r));
2071                         return r;
2072                 }
2073
2074                 r = sd_rtnl_message_close_container(m);
2075                 if (r < 0) {
2076                         log_error("Failed to close netlink container: %s", strerror(-r));
2077                         return r;
2078                 }
2079
2080                 r = sd_rtnl_message_close_container(m);
2081                 if (r < 0) {
2082                         log_error("Failed to close netlink container: %s", strerror(-r));
2083                         return r;
2084                 }
2085
2086                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2087                 if (r < 0) {
2088                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
2089                         return r;
2090                 }
2091         }
2092
2093         return 0;
2094 }
2095
/* Installs a seccomp filter for the calling process.
 *
 * The filter is created with SCMP_ACT_ALLOW as default action and then
 * gets two kinds of rules:
 *   - a fixed list of syscalls that fail with EPERM
 *   - socket(AF_NETLINK, *, NETLINK_AUDIT), which fails with
 *     EAFNOSUPPORT
 *
 * Returns 0 on success (and always when built without HAVE_SECCOMP),
 * a negative errno-style value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blocked_syscalls); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[n], 0);

                /* -EFAULT indicates an unknown syscall; such entries
                 * are simply skipped. */
                if (r < 0 && r != -EFAULT) {
                        log_error("Failed to block syscall: %s", strerror(-r));
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(ctx,
                             SCMP_ACT_ERRNO(EAFNOSUPPORT),
                             SCMP_SYS(socket),
                             2,
                             SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                             SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        /* Set the filter's NNP control attribute to 0 before loading. */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* The filter context is released on every path, whether or not
         * loading worked. */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2175
2176 static int setup_image(char **device_path, int *loop_nr) {
2177         struct loop_info64 info = {
2178                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2179         };
2180         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2181         _cleanup_free_ char* loopdev = NULL;
2182         struct stat st;
2183         int r, nr;
2184
2185         assert(device_path);
2186         assert(loop_nr);
2187
2188         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2189         if (fd < 0) {
2190                 log_error("Failed to open %s: %m", arg_image);
2191                 return -errno;
2192         }
2193
2194         if (fstat(fd, &st) < 0) {
2195                 log_error("Failed to stat %s: %m", arg_image);
2196                 return -errno;
2197         }
2198
2199         if (S_ISBLK(st.st_mode)) {
2200                 char *p;
2201
2202                 p = strdup(arg_image);
2203                 if (!p)
2204                         return log_oom();
2205
2206                 *device_path = p;
2207
2208                 *loop_nr = -1;
2209
2210                 r = fd;
2211                 fd = -1;
2212
2213                 return r;
2214         }
2215
2216         if (!S_ISREG(st.st_mode)) {
2217                 log_error("%s is not a regular file or block device: %m", arg_image);
2218                 return -EINVAL;
2219         }
2220
2221         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2222         if (control < 0) {
2223                 log_error("Failed to open /dev/loop-control: %m");
2224                 return -errno;
2225         }
2226
2227         nr = ioctl(control, LOOP_CTL_GET_FREE);
2228         if (nr < 0) {
2229                 log_error("Failed to allocate loop device: %m");
2230                 return -errno;
2231         }
2232
2233         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2234                 return log_oom();
2235
2236         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2237         if (loop < 0) {
2238                 log_error("Failed to open loop device %s: %m", loopdev);
2239                 return -errno;
2240         }
2241
2242         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2243                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2244                 return -errno;
2245         }
2246
2247         if (arg_read_only)
2248                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2249
2250         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2251                 log_error("Failed to set loopback settings on %s: %m", loopdev);
2252                 return -errno;
2253         }
2254
2255         *device_path = loopdev;
2256         loopdev = NULL;
2257
2258         *loop_nr = nr;
2259
2260         r = loop;
2261         loop = -1;
2262
2263         return r;
2264 }
2265
2266 static int dissect_image(
2267                 int fd,
2268                 char **root_device, bool *root_device_rw,
2269                 char **home_device, bool *home_device_rw,
2270                 char **srv_device, bool *srv_device_rw,
2271                 bool *secondary) {
2272
2273 #ifdef HAVE_BLKID
2274         int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
2275         _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
2276         _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
2277         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
2278         _cleanup_blkid_free_probe_ blkid_probe b = NULL;
2279         _cleanup_udev_unref_ struct udev *udev = NULL;
2280         struct udev_list_entry *first, *item;
2281         bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
2282         const char *pttype = NULL;
2283         blkid_partlist pl;
2284         struct stat st;
2285         int r;
2286
2287         assert(fd >= 0);
2288         assert(root_device);
2289         assert(home_device);
2290         assert(srv_device);
2291         assert(secondary);
2292
2293         b = blkid_new_probe();
2294         if (!b)
2295                 return log_oom();
2296
2297         errno = 0;
2298         r = blkid_probe_set_device(b, fd, 0, 0);
2299         if (r != 0) {
2300                 if (errno == 0)
2301                         return log_oom();
2302
2303                 log_error("Failed to set device on blkid probe: %m");
2304                 return -errno;
2305         }
2306
2307         blkid_probe_enable_partitions(b, 1);
2308         blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);
2309
2310         errno = 0;
2311         r = blkid_do_safeprobe(b);
2312         if (r == -2 || r == 1) {
2313                 log_error("Failed to identify any partition table on %s.\n"
2314                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2315                 return -EINVAL;
2316         } else if (r != 0) {
2317                 if (errno == 0)
2318                         errno = EIO;
2319                 log_error("Failed to probe: %m");
2320                 return -errno;
2321         }
2322
2323         blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
2324         if (!streq_ptr(pttype, "gpt")) {
2325                 log_error("Image %s does not carry a GUID Partition Table.\n"
2326                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2327                 return -EINVAL;
2328         }
2329
2330         errno = 0;
2331         pl = blkid_probe_get_partitions(b);
2332         if (!pl) {
2333                 if (errno == 0)
2334                         return log_oom();
2335
2336                 log_error("Failed to list partitions of %s", arg_image);
2337                 return -errno;
2338         }
2339
2340         udev = udev_new();
2341         if (!udev)
2342                 return log_oom();
2343
2344         if (fstat(fd, &st) < 0) {
2345                 log_error("Failed to stat block device: %m");
2346                 return -errno;
2347         }
2348
2349         d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
2350         if (!d)
2351                 return log_oom();
2352
2353         e = udev_enumerate_new(udev);
2354         if (!e)
2355                 return log_oom();
2356
2357         r = udev_enumerate_add_match_parent(e, d);
2358         if (r < 0)
2359                 return log_oom();
2360
2361         r = udev_enumerate_scan_devices(e);
2362         if (r < 0) {
2363                 log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
2364                 return r;
2365         }
2366
2367         first = udev_enumerate_get_list_entry(e);
2368         udev_list_entry_foreach(item, first) {
2369                 _cleanup_udev_device_unref_ struct udev_device *q;
2370                 const char *stype, *node;
2371                 unsigned long long flags;
2372                 sd_id128_t type_id;
2373                 blkid_partition pp;
2374                 dev_t qn;
2375                 int nr;
2376
2377                 errno = 0;
2378                 q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
2379                 if (!q) {
2380                         if (!errno)
2381                                 errno = ENOMEM;
2382
2383                         log_error("Failed to get partition device of %s: %m", arg_image);
2384                         return -errno;
2385                 }
2386
2387                 qn = udev_device_get_devnum(q);
2388                 if (major(qn) == 0)
2389                         continue;
2390
2391                 if (st.st_rdev == qn)
2392                         continue;
2393
2394                 node = udev_device_get_devnode(q);
2395                 if (!node)
2396                         continue;
2397
2398                 pp = blkid_partlist_devno_to_partition(pl, qn);
2399                 if (!pp)
2400                         continue;
2401
2402                 flags = blkid_partition_get_flags(pp);
2403                 if (flags & GPT_FLAG_NO_AUTO)
2404                         continue;
2405
2406                 nr = blkid_partition_get_partno(pp);
2407                 if (nr < 0)
2408                         continue;
2409
2410                 stype = blkid_partition_get_type_string(pp);
2411                 if (!stype)
2412                         continue;
2413
2414                 if (sd_id128_from_string(stype, &type_id) < 0)
2415                         continue;
2416
2417                 if (sd_id128_equal(type_id, GPT_HOME)) {
2418
2419                         if (home && nr >= home_nr)
2420                                 continue;
2421
2422                         home_nr = nr;
2423                         home_rw = !(flags & GPT_FLAG_READ_ONLY);
2424
2425                         free(home);
2426                         home = strdup(node);
2427                         if (!home)
2428                                 return log_oom();
2429                 } else if (sd_id128_equal(type_id, GPT_SRV)) {
2430
2431                         if (srv && nr >= srv_nr)
2432                                 continue;
2433
2434                         srv_nr = nr;
2435                         srv_rw = !(flags & GPT_FLAG_READ_ONLY);
2436
2437                         free(srv);
2438                         srv = strdup(node);
2439                         if (!srv)
2440                                 return log_oom();
2441                 }
2442 #ifdef GPT_ROOT_NATIVE
2443                 else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {
2444
2445                         if (root && nr >= root_nr)
2446                                 continue;
2447
2448                         root_nr = nr;
2449                         root_rw = !(flags & GPT_FLAG_READ_ONLY);
2450
2451                         free(root);
2452                         root = strdup(node);
2453                         if (!root)
2454                                 return log_oom();
2455                 }
2456 #endif
2457 #ifdef GPT_ROOT_SECONDARY
2458                 else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {
2459
2460                         if (secondary_root && nr >= secondary_root_nr)
2461                                 continue;
2462
2463                         secondary_root_nr = nr;
2464                         secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);
2465
2466
2467                         free(secondary_root);
2468                         secondary_root = strdup(node);
2469                         if (!secondary_root)
2470                                 return log_oom();
2471                 }
2472 #endif
2473         }
2474
2475         if (!root && !secondary_root) {
2476                 log_error("Failed to identify root partition in disk image %s.\n"
2477                           "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
2478                 return -EINVAL;
2479         }
2480
2481         if (root) {
2482                 *root_device = root;
2483                 root = NULL;
2484
2485                 *root_device_rw = root_rw;
2486                 *secondary = false;
2487         } else if (secondary_root) {
2488                 *root_device = secondary_root;
2489                 secondary_root = NULL;
2490
2491                 *root_device_rw = secondary_root_rw;
2492                 *secondary = true;
2493         }
2494
2495         if (home) {
2496                 *home_device = home;
2497                 home = NULL;
2498
2499                 *home_device_rw = home_rw;
2500         }
2501
2502         if (srv) {
2503                 *srv_device = srv;
2504                 srv = NULL;
2505
2506                 *srv_device_rw = srv_rw;
2507         }
2508
2509         return 0;
2510 #else
2511         log_error("--image= is not supported, compiled without blkid support.");
2512         return -ENOTSUP;
2513 #endif
2514 }
2515
/*
 * Probe the file system type of the block device "what" with libblkid and
 * mount it below "where".
 *
 *   what      - device node to mount
 *   where     - base directory of the mount point
 *   directory - optional suffix appended to "where" (NULL to mount on
 *               "where" itself)
 *   rw        - mount writable; forced to false when arg_read_only is set
 *
 * Returns 0 on success and a negative errno-style value on failure. LUKS
 * volumes are refused with -ENOTSUP. Without blkid support the function
 * always fails with -ENOTSUP.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* Clear errno first so that a NULL return with errno still 0 can be
         * told apart from a failure that did set errno. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* 1: nothing was detected, -2: the result is ambivalent.
                 * In both cases there is no usable file system type. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                /* Any other non-zero value (notably -1) is a probing error. */
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2581
/*
 * Mount the partitions that were picked from a disk image below "where":
 * the root device on "where" itself, the home device on "where"/home and
 * the srv device on "where"/srv. Devices passed as NULL are skipped.
 *
 * Each *_rw flag is handed on to mount_device() for the matching device.
 *
 * Returns 0 on success, or the negative error of the first mount that
 * failed; later devices are not attempted in that case.
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the "where" argument rather than reaching for
         * the global arg_directory, so the parameter is actually honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2617
2618 static void loop_remove(int nr, int *image_fd) {
2619         _cleanup_close_ int control = -1;
2620         int r;
2621
2622         if (nr < 0)
2623                 return;
2624
2625         if (image_fd && *image_fd >= 0) {
2626                 r = ioctl(*image_fd, LOOP_CLR_FD);
2627                 if (r < 0)
2628                         log_warning("Failed to close loop image: %m");
2629                 *image_fd = safe_close(*image_fd);
2630         }
2631
2632         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2633         if (control < 0) {
2634                 log_warning("Failed to open /dev/loop-control: %m");
2635                 return;
2636         }
2637
2638         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2639         if (r < 0)
2640                 log_warning("Failed to remove loop %d: %m", nr);
2641 }
2642
2643 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2644         int pipe_fds[2];
2645         pid_t pid;
2646
2647         assert(database);
2648         assert(key);
2649         assert(rpid);
2650
2651         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2652                 log_error("Failed to allocate pipe: %m");
2653                 return -errno;
2654         }
2655
2656         pid = fork();
2657         if (pid < 0) {
2658                 log_error("Failed to fork getent child: %m");
2659                 return -errno;
2660         } else if (pid == 0) {
2661                 int nullfd;
2662                 char *empty_env = NULL;
2663
2664                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2665                         _exit(EXIT_FAILURE);
2666
2667                 if (pipe_fds[0] > 2)
2668                         safe_close(pipe_fds[0]);
2669                 if (pipe_fds[1] > 2)
2670                         safe_close(pipe_fds[1]);
2671
2672                 nullfd = open("/dev/null", O_RDWR);
2673                 if (nullfd < 0)
2674                         _exit(EXIT_FAILURE);
2675
2676                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2677                         _exit(EXIT_FAILURE);
2678
2679                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2680                         _exit(EXIT_FAILURE);
2681
2682                 if (nullfd > 2)
2683                         safe_close(nullfd);
2684
2685                 reset_all_signal_handlers();
2686                 close_all_fds(NULL, 0);
2687
2688                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2689                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2690                 _exit(EXIT_FAILURE);
2691         }
2692
2693         pipe_fds[1] = safe_close(pipe_fds[1]);
2694
2695         *rpid = pid;
2696
2697         return pipe_fds[0];
2698 }
2699
2700 static int change_uid_gid(char **_home) {
2701         char line[LINE_MAX], *x, *u, *g, *h;
2702         const char *word, *state;
2703         _cleanup_free_ uid_t *uids = NULL;
2704         _cleanup_free_ char *home = NULL;
2705         _cleanup_fclose_ FILE *f = NULL;
2706         _cleanup_close_ int fd = -1;
2707         unsigned n_uids = 0;
2708         size_t sz = 0, l;
2709         uid_t uid;
2710         gid_t gid;
2711         pid_t pid;
2712         int r;
2713
2714         assert(_home);
2715
2716         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2717                 /* Reset everything fully to 0, just in case */
2718
2719                 if (setgroups(0, NULL) < 0) {
2720                         log_error("setgroups() failed: %m");
2721                         return -errno;
2722                 }
2723
2724                 if (setresgid(0, 0, 0) < 0) {
2725                         log_error("setregid() failed: %m");
2726                         return -errno;
2727                 }
2728
2729                 if (setresuid(0, 0, 0) < 0) {
2730                         log_error("setreuid() failed: %m");
2731                         return -errno;
2732                 }
2733
2734                 *_home = NULL;
2735                 return 0;
2736         }
2737
2738         /* First, get user credentials */
2739         fd = spawn_getent("passwd", arg_user, &pid);
2740         if (fd < 0)
2741                 return fd;
2742
2743         f = fdopen(fd, "r");
2744         if (!f)
2745                 return log_oom();
2746         fd = -1;
2747
2748         if (!fgets(line, sizeof(line), f)) {
2749
2750                 if (!ferror(f)) {
2751                         log_error("Failed to resolve user %s.", arg_user);
2752                         return -ESRCH;
2753                 }
2754
2755                 log_error("Failed to read from getent: %m");
2756                 return -errno;
2757         }
2758
2759         truncate_nl(line);
2760
2761         wait_for_terminate_and_warn("getent passwd", pid);
2762
2763         x = strchr(line, ':');
2764         if (!x) {
2765                 log_error("/etc/passwd entry has invalid user field.");
2766                 return -EIO;
2767         }
2768
2769         u = strchr(x+1, ':');
2770         if (!u) {
2771                 log_error("/etc/passwd entry has invalid password field.");
2772                 return -EIO;
2773         }
2774
2775         u++;
2776         g = strchr(u, ':');
2777         if (!g) {
2778                 log_error("/etc/passwd entry has invalid UID field.");
2779                 return -EIO;
2780         }
2781
2782         *g = 0;
2783         g++;
2784         x = strchr(g, ':');
2785         if (!x) {
2786                 log_error("/etc/passwd entry has invalid GID field.");
2787                 return -EIO;
2788         }
2789
2790         *x = 0;
2791         h = strchr(x+1, ':');
2792         if (!h) {
2793                 log_error("/etc/passwd entry has invalid GECOS field.");
2794                 return -EIO;
2795         }
2796
2797         h++;
2798         x = strchr(h, ':');
2799         if (!x) {
2800                 log_error("/etc/passwd entry has invalid home directory field.");
2801                 return -EIO;
2802         }
2803
2804         *x = 0;
2805
2806         r = parse_uid(u, &uid);
2807         if (r < 0) {
2808                 log_error("Failed to parse UID of user.");
2809                 return -EIO;
2810         }
2811
2812         r = parse_gid(g, &gid);
2813         if (r < 0) {
2814                 log_error("Failed to parse GID of user.");
2815                 return -EIO;
2816         }
2817
2818         home = strdup(h);
2819         if (!home)
2820                 return log_oom();
2821
2822         /* Second, get group memberships */
2823         fd = spawn_getent("initgroups", arg_user, &pid);
2824         if (fd < 0)
2825                 return fd;
2826
2827         fclose(f);
2828         f = fdopen(fd, "r");
2829         if (!f)
2830                 return log_oom();
2831         fd = -1;
2832
2833         if (!fgets(line, sizeof(line), f)) {
2834                 if (!ferror(f)) {
2835                         log_error("Failed to resolve user %s.", arg_user);
2836                         return -ESRCH;
2837                 }
2838
2839                 log_error("Failed to read from getent: %m");
2840                 return -errno;
2841         }
2842
2843         truncate_nl(line);
2844
2845         wait_for_terminate_and_warn("getent initgroups", pid);
2846
2847         /* Skip over the username and subsequent separator whitespace */
2848         x = line;
2849         x += strcspn(x, WHITESPACE);
2850         x += strspn(x, WHITESPACE);
2851
2852         FOREACH_WORD(word, l, x, state) {
2853                 char c[l+1];
2854
2855                 memcpy(c, word, l);
2856                 c[l] = 0;
2857
2858                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2859                         return log_oom();
2860
2861                 r = parse_uid(c, &uids[n_uids++]);
2862                 if (r < 0) {
2863                         log_error("Failed to parse group data from getent.");
2864                         return -EIO;
2865                 }
2866         }
2867
2868         r = mkdir_parents(home, 0775);
2869         if (r < 0) {
2870                 log_error("Failed to make home root directory: %s", strerror(-r));
2871                 return r;
2872         }
2873
2874         r = mkdir_safe(home, 0755, uid, gid);
2875         if (r < 0 && r != -EEXIST) {
2876                 log_error("Failed to make home directory: %s", strerror(-r));
2877                 return r;
2878         }
2879
2880         fchown(STDIN_FILENO, uid, gid);
2881         fchown(STDOUT_FILENO, uid, gid);
2882         fchown(STDERR_FILENO, uid, gid);
2883
2884         if (setgroups(n_uids, uids) < 0) {
2885                 log_error("Failed to set auxiliary groups: %m");
2886                 return -errno;
2887         }
2888
2889         if (setresgid(gid, gid, gid) < 0) {
2890                 log_error("setregid() failed: %m");
2891                 return -errno;
2892         }
2893
2894         if (setresuid(uid, uid, uid) < 0) {
2895                 log_error("setreuid() failed: %m");
2896                 return -errno;
2897         }
2898
2899         if (_home) {
2900                 *_home = home;
2901                 home = NULL;
2902         }
2903
2904         return 0;
2905 }
2906
2907 /*
2908  * Return values:
2909  * < 0 : wait_for_terminate() failed to get the state of the
2910  *       container, the container was terminated by a signal, or
2911  *       failed for an unknown reason.  No change is made to the
2912  *       container argument.
2913  * > 0 : The program executed in the container terminated with an
2914  *       error.  The exit code of the program executed in the
2915  *       container is returned.  The container argument has been set
2916  *       to CONTAINER_TERMINATED.
2917  *   0 : The container is being rebooted, has been shut down or exited
2918  *       successfully.  The container argument has been set to either
2919  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2920  *
2921  * That is, success is indicated by a return value of zero, and an
2922  * error is indicated by a non-zero value.
2923  */
2924 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2925         siginfo_t status;
2926         int r;
2927
2928         r = wait_for_terminate(pid, &status);
2929         if (r < 0) {
2930                 log_warning("Failed to wait for container: %s", strerror(-r));
2931                 return r;
2932         }
2933
2934         switch (status.si_code) {
2935
2936         case CLD_EXITED:
2937                 if (status.si_status == 0) {
2938                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2939
2940                 } else
2941                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2942
2943                 *container = CONTAINER_TERMINATED;
2944                 return status.si_status;
2945
2946         case CLD_KILLED:
2947                 if (status.si_status == SIGINT) {
2948
2949                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2950                         *container = CONTAINER_TERMINATED;
2951                         return 0;
2952
2953                 } else if (status.si_status == SIGHUP) {
2954
2955                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2956                         *container = CONTAINER_REBOOTED;
2957                         return 0;
2958                 }
2959
2960                 /* CLD_KILLED fallthrough */
2961
2962         case CLD_DUMPED:
2963                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2964                 return -EIO;
2965
2966         default:
2967                 log_error("Container %s failed due to unknown reason.", arg_machine);
2968                 return -EIO;
2969         }
2970
2971         return r;
2972 }
2973
/* Signal handler that deliberately does nothing; "sig" is ignored. */
static void nop_handler(int sig) {
        (void) sig;
}
2975
2976 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2977         pid_t pid;
2978
2979         pid = PTR_TO_UINT32(userdata);
2980         if (pid > 0) {
2981                 if (kill(pid, SIGRTMIN+3) >= 0) {
2982                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2983                         sd_event_source_set_userdata(s, NULL);
2984                         return 0;
2985                 }
2986         }
2987
2988         sd_event_exit(sd_event_source_get_event(s), 0);
2989         return 0;
2990 }
2991
2992 int main(int argc, char *argv[]) {
2993
2994         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2995         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2996         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2997         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2998         _cleanup_fdset_free_ FDSet *fds = NULL;
2999         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
3000         const char *console = NULL;
3001         char veth_name[IFNAMSIZ];
3002         bool secondary = false;
3003         sigset_t mask, mask_chld;
3004         pid_t pid = 0;
3005
3006         log_parse_environment();
3007         log_open();
3008
3009         k = parse_argv(argc, argv);
3010         if (k < 0)
3011                 goto finish;
3012         else if (k == 0) {
3013                 r = EXIT_SUCCESS;
3014                 goto finish;
3015         }
3016
3017         if (!arg_image) {
3018                 if (arg_directory) {
3019                         char *p;
3020
3021                         p = path_make_absolute_cwd(arg_directory);
3022                         free(arg_directory);
3023                         arg_directory = p;
3024                 } else
3025                         arg_directory = get_current_dir_name();
3026
3027                 if (!arg_directory) {
3028                         log_error("Failed to determine path, please use -D.");
3029                         goto finish;
3030                 }
3031                 path_kill_slashes(arg_directory);
3032         }
3033
3034         if (!arg_machine) {
3035                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
3036                 if (!arg_machine) {
3037                         log_oom();
3038                         goto finish;
3039                 }
3040
3041                 hostname_cleanup(arg_machine, false);
3042                 if (isempty(arg_machine)) {
3043                         log_error("Failed to determine machine name automatically, please use -M.");
3044                         goto finish;
3045                 }
3046         }
3047
3048         if (geteuid() != 0) {
3049                 log_error("Need to be root.");
3050                 goto finish;
3051         }
3052
3053         if (sd_booted() <= 0) {
3054                 log_error("Not running on a systemd system.");
3055                 goto finish;
3056         }
3057
3058         log_close();
3059         n_fd_passed = sd_listen_fds(false);
3060         if (n_fd_passed > 0) {
3061                 k = fdset_new_listen_fds(&fds, false);
3062                 if (k < 0) {
3063                         log_error("Failed to collect file descriptors: %s", strerror(-k));
3064                         goto finish;
3065                 }
3066         }
3067         fdset_close_others(fds);
3068         log_open();
3069
3070         if (arg_directory) {
3071                 if (path_equal(arg_directory, "/")) {
3072                         log_error("Spawning container on root directory not supported.");
3073                         goto finish;
3074                 }
3075
3076                 if (arg_boot) {
3077                         if (path_is_os_tree(arg_directory) <= 0) {
3078                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
3079                                 goto finish;
3080                         }
3081                 } else {
3082                         const char *p;
3083
3084                         p = strappenda(arg_directory,
3085                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
3086                         if (access(p, F_OK) < 0) {
3087                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
3088                                 goto finish;
3089
3090                         }
3091                 }
3092         } else {
3093                 char template[] = "/tmp/nspawn-root-XXXXXX";
3094
3095                 if (!mkdtemp(template)) {
3096                         log_error("Failed to create temporary directory: %m");
3097                         r = -errno;
3098                         goto finish;
3099                 }
3100
3101                 arg_directory = strdup(template);
3102                 if (!arg_directory) {
3103                         r = log_oom();
3104                         goto finish;
3105                 }
3106
3107                 image_fd = setup_image(&device_path, &loop_nr);
3108                 if (image_fd < 0) {
3109                         r = image_fd;
3110                         goto finish;
3111                 }
3112
3113                 r = dissect_image(image_fd,
3114                                   &root_device, &root_device_rw,
3115                                   &home_device, &home_device_rw,
3116                                   &srv_device, &srv_device_rw,
3117                                   &secondary);
3118                 if (r < 0)
3119                         goto finish;
3120         }
3121
3122         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3123         if (master < 0) {
3124                 log_error("Failed to acquire pseudo tty: %m");
3125                 goto finish;
3126         }
3127
3128         console = ptsname(master);
3129         if (!console) {
3130                 log_error("Failed to determine tty name: %m");
3131                 goto finish;
3132         }
3133
3134         if (!arg_quiet)
3135                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3136                          arg_machine, arg_image ? arg_image : arg_directory);
3137
3138         if (unlockpt(master) < 0) {
3139                 log_error("Failed to unlock tty: %m");
3140                 goto finish;
3141         }
3142
3143         if (access("/dev/kdbus/control", F_OK) >= 0) {
3144
3145                 if (arg_share_system) {
3146                         kdbus_domain = strdup("/dev/kdbus");
3147                         if (!kdbus_domain) {
3148                                 log_oom();
3149                                 goto finish;
3150                         }
3151                 } else {
3152                         const char *ns;
3153
3154                         ns = strappenda("machine-", arg_machine);
3155                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
3156                         if (r < 0)
3157                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
3158                         else
3159                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
3160                 }
3161         }
3162
3163         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3164                 log_error("Failed to create kmsg socket pair: %m");
3165                 goto finish;
3166         }
3167
3168         sd_notify(false,
3169                   "READY=1\n"
3170                   "STATUS=Container running.");
3171
3172         assert_se(sigemptyset(&mask) == 0);
3173         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3174         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3175
3176         assert_se(sigemptyset(&mask_chld) == 0);
3177         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3178
3179         for (;;) {
3180                 ContainerStatus container_status;
3181                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3182                 struct sigaction sa = {
3183                         .sa_handler = nop_handler,
3184                         .sa_flags = SA_NOCLDSTOP,
3185                 };
3186
3187                 r = barrier_create(&barrier);
3188                 if (r < 0) {
3189                         log_error("Cannot initialize IPC barrier: %s", strerror(-r));
3190                         goto finish;
3191                 }
3192
3193                 /* Child can be killed before execv(), so handle SIGCHLD
3194                  * in order to interrupt parent's blocking calls and
3195                  * give it a chance to call wait() and terminate. */
3196                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3197                 if (r < 0) {
3198                         log_error("Failed to change the signal mask: %m");
3199                         goto finish;
3200                 }
3201
3202                 r = sigaction(SIGCHLD, &sa, NULL);
3203                 if (r < 0) {
3204                         log_error("Failed to install SIGCHLD handler: %m");
3205                         goto finish;
3206                 }
3207
3208                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3209                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3210                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
3211                 if (pid < 0) {
3212                         if (errno == EINVAL)
3213                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3214                         else
3215                                 log_error("clone() failed: %m");
3216
3217                         r = pid;
3218                         goto finish;
3219                 }
3220
3221                 if (pid == 0) {
3222                         /* child */
3223                         _cleanup_free_ char *home = NULL;
3224                         unsigned n_env = 2;
3225                         const char *envp[] = {
3226                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3227                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3228                                 NULL, /* TERM */
3229                                 NULL, /* HOME */
3230                                 NULL, /* USER */
3231                                 NULL, /* LOGNAME */
3232                                 NULL, /* container_uuid */
3233                                 NULL, /* LISTEN_FDS */
3234                                 NULL, /* LISTEN_PID */
3235                                 NULL
3236                         };
3237                         char **env_use;
3238
3239                         barrier_set_role(&barrier, BARRIER_CHILD);
3240
3241                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3242                         if (envp[n_env])
3243                                 n_env ++;
3244
3245                         master = safe_close(master);
3246
3247                         close_nointr(STDIN_FILENO);
3248                         close_nointr(STDOUT_FILENO);
3249                         close_nointr(STDERR_FILENO);
3250
3251                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3252
3253                         reset_all_signal_handlers();
3254                         reset_signal_mask();
3255
3256                         k = open_terminal(console, O_RDWR);
3257                         if (k != STDIN_FILENO) {
3258                                 if (k >= 0) {
3259                                         safe_close(k);
3260                                         k = -EINVAL;
3261                                 }
3262
3263                                 log_error("Failed to open console: %s", strerror(-k));
3264                                 _exit(EXIT_FAILURE);
3265                         }
3266
3267                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3268                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3269                                 log_error("Failed to duplicate console: %m");
3270                                 _exit(EXIT_FAILURE);
3271                         }
3272
3273                         if (setsid() < 0) {
3274                                 log_error("setsid() failed: %m");
3275                                 _exit(EXIT_FAILURE);
3276                         }
3277
3278                         if (reset_audit_loginuid() < 0)
3279                                 _exit(EXIT_FAILURE);
3280
3281                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3282                                 log_error("PR_SET_PDEATHSIG failed: %m");
3283                                 _exit(EXIT_FAILURE);
3284                         }
3285
3286                         /* Mark everything as slave, so that we still
3287                          * receive mounts from the real root, but don't
3288                          * propagate mounts to the real root. */
3289                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3290                                 log_error("MS_SLAVE|MS_REC failed: %m");
3291                                 _exit(EXIT_FAILURE);
3292                         }
3293
3294                         if (mount_devices(arg_directory,
3295                                           root_device, root_device_rw,
3296                                           home_device, home_device_rw,
3297                                           srv_device, srv_device_rw) < 0)
3298                                 _exit(EXIT_FAILURE);
3299
3300                         /* Turn directory into bind mount */
3301                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3302                                 log_error("Failed to make bind mount: %m");
3303                                 _exit(EXIT_FAILURE);
3304                         }
3305
3306                         r = setup_volatile(arg_directory);
3307                         if (r < 0)
3308                                 _exit(EXIT_FAILURE);
3309
3310                         if (setup_volatile_state(arg_directory) < 0)
3311                                 _exit(EXIT_FAILURE);
3312
3313                         r = base_filesystem_create(arg_directory);
3314                         if (r < 0)
3315                                 _exit(EXIT_FAILURE);
3316
3317                         if (arg_read_only) {
3318                                 k = bind_remount_recursive(arg_directory, true);
3319                                 if (k < 0) {
3320                                         log_error("Failed to make tree read-only: %s", strerror(-k));
3321                                         _exit(EXIT_FAILURE);
3322                                 }
3323                         }
3324
3325                         if (mount_all(arg_directory) < 0)
3326                                 _exit(EXIT_FAILURE);
3327
3328                         if (copy_devnodes(arg_directory) < 0)
3329                                 _exit(EXIT_FAILURE);
3330
3331                         if (setup_ptmx(arg_directory) < 0)
3332                                 _exit(EXIT_FAILURE);
3333
3334                         dev_setup(arg_directory);
3335
3336                         if (setup_seccomp() < 0)
3337                                 _exit(EXIT_FAILURE);
3338
3339                         if (setup_dev_console(arg_directory, console) < 0)
3340                                 _exit(EXIT_FAILURE);
3341
3342                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3343                                 _exit(EXIT_FAILURE);
3344
3345                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3346
3347                         if (setup_boot_id(arg_directory) < 0)
3348                                 _exit(EXIT_FAILURE);
3349
3350                         if (setup_timezone(arg_directory) < 0)
3351                                 _exit(EXIT_FAILURE);
3352
3353                         if (setup_resolv_conf(arg_directory) < 0)
3354                                 _exit(EXIT_FAILURE);
3355
3356                         if (setup_journal(arg_directory) < 0)
3357                                 _exit(EXIT_FAILURE);
3358
3359                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3360                                 _exit(EXIT_FAILURE);
3361
3362                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3363                                 _exit(EXIT_FAILURE);
3364
3365                         if (mount_tmpfs(arg_directory) < 0)
3366                                 _exit(EXIT_FAILURE);
3367
3368                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3369                                 _exit(EXIT_FAILURE);
3370
3371                         /* Tell the parent that we are ready, and that
3372                          * it can cgroupify us so that we lack access
3373                          * to certain devices and resources. */
3374                         (void)barrier_place(&barrier);
3375
3376                         if (chdir(arg_directory) < 0) {
3377                                 log_error("chdir(%s) failed: %m", arg_directory);
3378                                 _exit(EXIT_FAILURE);
3379                         }
3380
3381                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3382                                 log_error("mount(MS_MOVE) failed: %m");
3383                                 _exit(EXIT_FAILURE);
3384                         }
3385
3386                         if (chroot(".") < 0) {
3387                                 log_error("chroot() failed: %m");
3388                                 _exit(EXIT_FAILURE);
3389                         }
3390
3391                         if (chdir("/") < 0) {
3392                                 log_error("chdir() failed: %m");
3393                                 _exit(EXIT_FAILURE);
3394                         }
3395
3396                         umask(0022);
3397
3398                         if (arg_private_network)
3399                                 loopback_setup();
3400
3401                         if (drop_capabilities() < 0) {
3402                                 log_error("drop_capabilities() failed: %m");
3403                                 _exit(EXIT_FAILURE);
3404                         }
3405
3406                         r = change_uid_gid(&home);
3407                         if (r < 0)
3408                                 _exit(EXIT_FAILURE);
3409
3410                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3411                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3412                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3413                                 log_oom();
3414                                 _exit(EXIT_FAILURE);
3415                         }
3416
3417                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3418                                 char as_uuid[37];
3419
3420                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3421                                         log_oom();
3422                                         _exit(EXIT_FAILURE);
3423                                 }
3424                         }
3425
3426                         if (fdset_size(fds) > 0) {
3427                                 k = fdset_cloexec(fds, false);
3428                                 if (k < 0) {
3429                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3430                                         _exit(EXIT_FAILURE);
3431                                 }
3432
3433                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3434                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3435                                         log_oom();
3436                                         _exit(EXIT_FAILURE);
3437                                 }
3438                         }
3439
3440                         setup_hostname();
3441
3442                         if (arg_personality != 0xffffffffLU) {
3443                                 if (personality(arg_personality) < 0) {
3444                                         log_error("personality() failed: %m");
3445                                         _exit(EXIT_FAILURE);
3446                                 }
3447                         } else if (secondary) {
3448                                 if (personality(PER_LINUX32) < 0) {
3449                                         log_error("personality() failed: %m");
3450                                         _exit(EXIT_FAILURE);
3451                                 }
3452                         }
3453
3454 #ifdef HAVE_SELINUX
3455                         if (arg_selinux_context)
3456                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3457                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3458                                         _exit(EXIT_FAILURE);
3459                                 }
3460 #endif
3461
3462                         if (!strv_isempty(arg_setenv)) {
3463                                 char **n;
3464
3465                                 n = strv_env_merge(2, envp, arg_setenv);
3466                                 if (!n) {
3467                                         log_oom();
3468                                         _exit(EXIT_FAILURE);
3469                                 }
3470
3471                                 env_use = n;
3472                         } else
3473                                 env_use = (char**) envp;
3474
3475                         /* Wait until the parent is ready with the setup, too... */
3476                         if (!barrier_place_and_sync(&barrier))
3477                                 _exit(EXIT_FAILURE);
3478
3479                         if (arg_boot) {
3480                                 char **a;
3481                                 size_t l;
3482
3483                                 /* Automatically search for the init system */
3484
3485                                 l = 1 + argc - optind;
3486                                 a = newa(char*, l + 1);
3487                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3488
3489                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3490                                 execve(a[0], a, env_use);
3491
3492                                 a[0] = (char*) "/lib/systemd/systemd";
3493                                 execve(a[0], a, env_use);
3494
3495                                 a[0] = (char*) "/sbin/init";
3496                                 execve(a[0], a, env_use);
3497                         } else if (argc > optind)
3498                                 execvpe(argv[optind], argv + optind, env_use);
3499                         else {
3500                                 chdir(home ? home : "/root");
3501                                 execle("/bin/bash", "-bash", NULL, env_use);
3502                                 execle("/bin/sh", "-sh", NULL, env_use);
3503                         }
3504
3505                         log_error("execv() failed: %m");
3506                         _exit(EXIT_FAILURE);
3507                 }
3508
3509                 barrier_set_role(&barrier, BARRIER_PARENT);
3510                 fdset_free(fds);
3511                 fds = NULL;
3512
3513                 /* wait for child-setup to be done */
3514                 if (barrier_place_and_sync(&barrier)) {
3515                         _cleanup_event_unref_ sd_event *event = NULL;
3516                         _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
3517                         int ifi = 0;
3518
3519                         r = move_network_interfaces(pid);
3520                         if (r < 0)
3521                                 goto finish;
3522
3523                         r = setup_veth(pid, veth_name, &ifi);
3524                         if (r < 0)
3525                                 goto finish;
3526
3527                         r = setup_bridge(veth_name, &ifi);
3528                         if (r < 0)
3529                                 goto finish;
3530
3531                         r = setup_macvlan(pid);
3532                         if (r < 0)
3533                                 goto finish;
3534
3535                         r = register_machine(pid, ifi);
3536                         if (r < 0)
3537                                 goto finish;
3538
3539                         /* Block SIGCHLD here, before notifying child.
3540                          * process_pty() will handle it with the other signals. */
3541                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3542                         if (r < 0)
3543                                 goto finish;
3544
3545                         /* Reset signal to default */
3546                         r = default_signals(SIGCHLD, -1);
3547                         if (r < 0)
3548                                 goto finish;
3549
3550                         /* Notify the child that the parent is ready with all
3551                          * its setup, and that the child can now hand over
3552                          * control to the code to run inside the container. */
3553                         (void)barrier_place(&barrier);
3554
3555                         r = sd_event_new(&event);
3556                         if (r < 0) {
3557                                 log_error("Failed to get default event source: %s", strerror(-r));
3558                                 goto finish;
3559                         }
3560
3561                         if (arg_boot) {
3562                                 /* Try to kill the init system on SIGINT or SIGTERM */
3563                                 sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, UINT32_TO_PTR(pid));
3564                                 sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, UINT32_TO_PTR(pid));
3565                         } else {
3566                                 /* Immediately exit */
3567                                 sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
3568                                 sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
3569                         }
3570
3571                         /* simply exit on sigchld */
3572                         sd_event_add_signal(event, NULL, SIGCHLD, NULL, NULL);
3573
3574                         r = pty_forward_new(event, master, &forward);
3575                         if (r < 0) {
3576                                 log_error("Failed to create PTY forwarder: %s", strerror(-r));
3577                                 goto finish;
3578                         }
3579
3580                         r = sd_event_loop(event);
3581                         if (r < 0) {
3582                                 log_error("Failed to run event loop: %s", strerror(-r));
3583                                 return r;
3584                         }
3585
3586                         forward = pty_forward_free(forward);
3587
3588                         if (!arg_quiet)
3589                                 putc('\n', stdout);
3590
3591                         /* Kill if it is not dead yet anyway */
3592                         terminate_machine(pid);
3593                 }
3594
3595                 /* Normally redundant, but better safe than sorry */
3596                 kill(pid, SIGKILL);
3597
3598                 r = wait_for_container(pid, &container_status);
3599                 pid = 0;
3600
3601                 if (r < 0) {
3602                         /* We failed to wait for the container, or the
3603                          * container exited abnormally */
3604                         r = EXIT_FAILURE;
3605                         break;
3606                 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3607                         /* The container exited with a non-zero
3608                          * status, or with zero status and no reboot
3609                          * was requested. */
3610                         break;
3611
3612                 /* CONTAINER_REBOOTED, loop again */
3613
3614                 if (arg_keep_unit) {
3615                         /* Special handling if we are running as a
3616                          * service: instead of simply restarting the
3617                          * machine we want to restart the entire
3618                          * service, so let's inform systemd about this
3619                          * with the special exit code 133. The service
3620                          * file uses RestartForceExitStatus=133 so
3621                          * that this results in a full nspawn
3622                          * restart. This is necessary since we might
3623                          * have cgroup parameters set we want to have
3624                          * flushed out. */
3625                         r = 133;
3626                         break;
3627                 }
3628         }
3629
3630 finish:
3631         sd_notify(false,
3632                   "STOPPING=1\n"
3633                   "STATUS=Terminating...");
3634
3635         loop_remove(loop_nr, &image_fd);
3636
3637         if (pid > 0)
3638                 kill(pid, SIGKILL);
3639
3640         free(arg_directory);
3641         free(arg_machine);
3642         free(arg_user);
3643         strv_free(arg_setenv);
3644         strv_free(arg_network_interfaces);
3645         strv_free(arg_network_macvlan);
3646         strv_free(arg_bind);
3647         strv_free(arg_bind_ro);
3648         strv_free(arg_tmpfs);
3649
3650         return r;
3651 }