chiark / gitweb /
add new systemd-escape tool
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "eventfd-util.h"
88 #include "blkid-util.h"
89 #include "gpt.h"
90 #include "siphash24.h"
91 #include "copy.h"
92 #include "base-filesystem.h"
93
94 #ifdef HAVE_SECCOMP
95 #include "seccomp-util.h"
96 #endif
97
/* Outcome of one container run.
 * NOTE(review): the code that produces and consumes these values lies outside
 * this section; presumably TERMINATED means "finished for good" and REBOOTED
 * means "the container asked to be started again" -- confirm at the users. */
98 typedef enum ContainerStatus {
99         CONTAINER_TERMINATED,
100         CONTAINER_REBOOTED
101 } ContainerStatus;
102
/* Journal linking mode chosen with --link-journal=no|auto|host|guest.
 * parse_argv() maps each keyword to the value of the same name, and the
 * short option -j selects LINK_GUEST. The file-level default is LINK_AUTO. */
103 typedef enum LinkJournal {
104         LINK_NO,
105         LINK_AUTO,
106         LINK_HOST,
107         LINK_GUEST
108 } LinkJournal;
109
/* Mode selected with --volatile[=MODE]. parse_argv() stores VOLATILE_YES when
 * the option has no argument or a true boolean, VOLATILE_NO for a false
 * boolean, and VOLATILE_STATE for the literal argument "state". */
110 typedef enum Volatile {
111         VOLATILE_NO,
112         VOLATILE_YES,
113         VOLATILE_STATE,
114 } Volatile;
115
/* Command line settings; all of them are filled in by parse_argv().
 *
 * Ownership, as visible in parse_argv(): the non-const "char *" strings
 * (arg_directory, arg_user, arg_machine) are heap copies that are free()d
 * before being replaced, while the "const char *" settings are assigned
 * optarg directly and therefore point into argv. */
116 static char *arg_directory = NULL;
117 static char *arg_user = NULL;
118 static sd_id128_t arg_uuid = {};
119 static char *arg_machine = NULL;
120 static const char *arg_selinux_context = NULL;
121 static const char *arg_selinux_apifs_context = NULL;
122 static const char *arg_slice = NULL;
123 static bool arg_private_network = false;
124 static bool arg_read_only = false;
125 static bool arg_boot = false;
126 static LinkJournal arg_link_journal = LINK_AUTO;
/* Default set of capabilities, one bit per CAP_* number. At the end of
 * parse_argv() the bits named by --capability= are added, CAP_NET_ADMIN is
 * added when a private network was requested, and the bits named by
 * --drop-capability= are cleared last. */
127 static uint64_t arg_retain =
128         (1ULL << CAP_CHOWN) |
129         (1ULL << CAP_DAC_OVERRIDE) |
130         (1ULL << CAP_DAC_READ_SEARCH) |
131         (1ULL << CAP_FOWNER) |
132         (1ULL << CAP_FSETID) |
133         (1ULL << CAP_IPC_OWNER) |
134         (1ULL << CAP_KILL) |
135         (1ULL << CAP_LEASE) |
136         (1ULL << CAP_LINUX_IMMUTABLE) |
137         (1ULL << CAP_NET_BIND_SERVICE) |
138         (1ULL << CAP_NET_BROADCAST) |
139         (1ULL << CAP_NET_RAW) |
140         (1ULL << CAP_SETGID) |
141         (1ULL << CAP_SETFCAP) |
142         (1ULL << CAP_SETPCAP) |
143         (1ULL << CAP_SETUID) |
144         (1ULL << CAP_SYS_ADMIN) |
145         (1ULL << CAP_SYS_CHROOT) |
146         (1ULL << CAP_SYS_NICE) |
147         (1ULL << CAP_SYS_PTRACE) |
148         (1ULL << CAP_SYS_TTY_CONFIG) |
149         (1ULL << CAP_SYS_RESOURCE) |
150         (1ULL << CAP_SYS_BOOT) |
151         (1ULL << CAP_AUDIT_WRITE) |
152         (1ULL << CAP_AUDIT_CONTROL) |
153         (1ULL << CAP_MKNOD);
/* arg_bind and arg_bind_ro are flat string vectors holding consecutive
 * (source, destination) pairs; arg_tmpfs holds consecutive
 * (path, mount options) pairs. */
154 static char **arg_bind = NULL;
155 static char **arg_bind_ro = NULL;
156 static char **arg_tmpfs = NULL;
157 static char **arg_setenv = NULL;
158 static bool arg_quiet = false;
159 static bool arg_share_system = false;
/* Forced to false by parse_argv() whenever --share-system is given. */
160 static bool arg_register = true;
161 static bool arg_keep_unit = false;
162 static char **arg_network_interfaces = NULL;
163 static char **arg_network_macvlan = NULL;
164 static bool arg_network_veth = false;
165 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is the value parse_argv() treats as "unknown personality" when
 * returned by personality_from_string(); as the initial value it presumably
 * means "no personality requested" -- TODO confirm at the point of use. */
166 static unsigned long arg_personality = 0xffffffffLU;
167 static const char *arg_image = NULL;
168 static Volatile arg_volatile = VOLATILE_NO;
168 static Volatile arg_volatile = VOLATILE_NO;
169
/* Print the usage summary to stdout.
 *
 * The text is kept in sync with the option table in parse_argv():
 *   - "-j" is documented as --link-journal=guest, which is what the parser
 *     actually selects;
 *   - "--tmpfs=" shows the ":OPTIONS" part as optional, since a bare path is
 *     accepted and gets the default options;
 *   - "--personality=" is listed, as the parser accepts it.
 *
 * Always returns 0, so that callers can use "return help();" to stop option
 * processing without signalling an error. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "  -i --image=PATH           File system device or image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --tmpfs=PATH[:OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --personality=ARCH     Pick personality for this container\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n",
               program_invocation_short_name);

        return 0;
}
223
224 static int parse_argv(int argc, char *argv[]) {
225
226         enum {
227                 ARG_VERSION = 0x100,
228                 ARG_PRIVATE_NETWORK,
229                 ARG_UUID,
230                 ARG_READ_ONLY,
231                 ARG_CAPABILITY,
232                 ARG_DROP_CAPABILITY,
233                 ARG_LINK_JOURNAL,
234                 ARG_BIND,
235                 ARG_BIND_RO,
236                 ARG_TMPFS,
237                 ARG_SETENV,
238                 ARG_SHARE_SYSTEM,
239                 ARG_REGISTER,
240                 ARG_KEEP_UNIT,
241                 ARG_NETWORK_INTERFACE,
242                 ARG_NETWORK_MACVLAN,
243                 ARG_NETWORK_VETH,
244                 ARG_NETWORK_BRIDGE,
245                 ARG_PERSONALITY,
246                 ARG_VOLATILE,
247         };
248
249         static const struct option options[] = {
250                 { "help",                  no_argument,       NULL, 'h'                   },
251                 { "version",               no_argument,       NULL, ARG_VERSION           },
252                 { "directory",             required_argument, NULL, 'D'                   },
253                 { "user",                  required_argument, NULL, 'u'                   },
254                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
255                 { "boot",                  no_argument,       NULL, 'b'                   },
256                 { "uuid",                  required_argument, NULL, ARG_UUID              },
257                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
258                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
259                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
260                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
261                 { "bind",                  required_argument, NULL, ARG_BIND              },
262                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
263                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
264                 { "machine",               required_argument, NULL, 'M'                   },
265                 { "slice",                 required_argument, NULL, 'S'                   },
266                 { "setenv",                required_argument, NULL, ARG_SETENV            },
267                 { "selinux-context",       required_argument, NULL, 'Z'                   },
268                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
269                 { "quiet",                 no_argument,       NULL, 'q'                   },
270                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
271                 { "register",              required_argument, NULL, ARG_REGISTER          },
272                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
273                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
274                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
275                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
276                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
277                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
278                 { "image",                 required_argument, NULL, 'i'                   },
279                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
280                 {}
281         };
282
283         int c, r;
284         uint64_t plus = 0, minus = 0;
285
286         assert(argc >= 0);
287         assert(argv);
288
289         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
290
291                 switch (c) {
292
293                 case 'h':
294                         return help();
295
296                 case ARG_VERSION:
297                         puts(PACKAGE_STRING);
298                         puts(SYSTEMD_FEATURES);
299                         return 0;
300
301                 case 'D':
302                         free(arg_directory);
303                         arg_directory = canonicalize_file_name(optarg);
304                         if (!arg_directory) {
305                                 log_error("Invalid root directory: %m");
306                                 return -ENOMEM;
307                         }
308
309                         break;
310
311                 case 'i':
312                         arg_image = optarg;
313                         break;
314
315                 case 'u':
316                         free(arg_user);
317                         arg_user = strdup(optarg);
318                         if (!arg_user)
319                                 return log_oom();
320
321                         break;
322
323                 case ARG_NETWORK_BRIDGE:
324                         arg_network_bridge = optarg;
325
326                         /* fall through */
327
328                 case ARG_NETWORK_VETH:
329                         arg_network_veth = true;
330                         arg_private_network = true;
331                         break;
332
333                 case ARG_NETWORK_INTERFACE:
334                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
335                                 return log_oom();
336
337                         arg_private_network = true;
338                         break;
339
340                 case ARG_NETWORK_MACVLAN:
341                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
342                                 return log_oom();
343
344                         /* fall through */
345
346                 case ARG_PRIVATE_NETWORK:
347                         arg_private_network = true;
348                         break;
349
350                 case 'b':
351                         arg_boot = true;
352                         break;
353
354                 case ARG_UUID:
355                         r = sd_id128_from_string(optarg, &arg_uuid);
356                         if (r < 0) {
357                                 log_error("Invalid UUID: %s", optarg);
358                                 return r;
359                         }
360                         break;
361
362                 case 'S':
363                         arg_slice = optarg;
364                         break;
365
366                 case 'M':
367                         if (isempty(optarg)) {
368                                 free(arg_machine);
369                                 arg_machine = NULL;
370                         } else {
371
372                                 if (!hostname_is_valid(optarg)) {
373                                         log_error("Invalid machine name: %s", optarg);
374                                         return -EINVAL;
375                                 }
376
377                                 free(arg_machine);
378                                 arg_machine = strdup(optarg);
379                                 if (!arg_machine)
380                                         return log_oom();
381
382                                 break;
383                         }
384
385                 case 'Z':
386                         arg_selinux_context = optarg;
387                         break;
388
389                 case 'L':
390                         arg_selinux_apifs_context = optarg;
391                         break;
392
393                 case ARG_READ_ONLY:
394                         arg_read_only = true;
395                         break;
396
397                 case ARG_CAPABILITY:
398                 case ARG_DROP_CAPABILITY: {
399                         char *state, *word;
400                         size_t length;
401
402                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
403                                 _cleanup_free_ char *t;
404                                 cap_value_t cap;
405
406                                 t = strndup(word, length);
407                                 if (!t)
408                                         return log_oom();
409
410                                 if (streq(t, "all")) {
411                                         if (c == ARG_CAPABILITY)
412                                                 plus = (uint64_t) -1;
413                                         else
414                                                 minus = (uint64_t) -1;
415                                 } else {
416                                         if (cap_from_name(t, &cap) < 0) {
417                                                 log_error("Failed to parse capability %s.", t);
418                                                 return -EINVAL;
419                                         }
420
421                                         if (c == ARG_CAPABILITY)
422                                                 plus |= 1ULL << (uint64_t) cap;
423                                         else
424                                                 minus |= 1ULL << (uint64_t) cap;
425                                 }
426                         }
427
428                         break;
429                 }
430
431                 case 'j':
432                         arg_link_journal = LINK_GUEST;
433                         break;
434
435                 case ARG_LINK_JOURNAL:
436                         if (streq(optarg, "auto"))
437                                 arg_link_journal = LINK_AUTO;
438                         else if (streq(optarg, "no"))
439                                 arg_link_journal = LINK_NO;
440                         else if (streq(optarg, "guest"))
441                                 arg_link_journal = LINK_GUEST;
442                         else if (streq(optarg, "host"))
443                                 arg_link_journal = LINK_HOST;
444                         else {
445                                 log_error("Failed to parse link journal mode %s", optarg);
446                                 return -EINVAL;
447                         }
448
449                         break;
450
451                 case ARG_BIND:
452                 case ARG_BIND_RO: {
453                         _cleanup_free_ char *a = NULL, *b = NULL;
454                         char *e;
455                         char ***x;
456
457                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
458
459                         e = strchr(optarg, ':');
460                         if (e) {
461                                 a = strndup(optarg, e - optarg);
462                                 b = strdup(e + 1);
463                         } else {
464                                 a = strdup(optarg);
465                                 b = strdup(optarg);
466                         }
467
468                         if (!a || !b)
469                                 return log_oom();
470
471                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
472                                 log_error("Invalid bind mount specification: %s", optarg);
473                                 return -EINVAL;
474                         }
475
476                         r = strv_extend(x, a);
477                         if (r < 0)
478                                 return log_oom();
479
480                         r = strv_extend(x, b);
481                         if (r < 0)
482                                 return log_oom();
483
484                         break;
485                 }
486
487                 case ARG_TMPFS: {
488                         _cleanup_free_ char *a = NULL, *b = NULL;
489                         char *e;
490
491                         e = strchr(optarg, ':');
492                         if (e) {
493                                 a = strndup(optarg, e - optarg);
494                                 b = strdup(e + 1);
495                         } else {
496                                 a = strdup(optarg);
497                                 b = strdup("mode=0755");
498                         }
499
500                         if (!a || !b)
501                                 return log_oom();
502
503                         if (!path_is_absolute(a)) {
504                                 log_error("Invalid tmpfs specification: %s", optarg);
505                                 return -EINVAL;
506                         }
507
508                         r = strv_push(&arg_tmpfs, a);
509                         if (r < 0)
510                                 return log_oom();
511
512                         a = NULL;
513
514                         r = strv_push(&arg_tmpfs, b);
515                         if (r < 0)
516                                 return log_oom();
517
518                         b = NULL;
519
520                         break;
521                 }
522
523                 case ARG_SETENV: {
524                         char **n;
525
526                         if (!env_assignment_is_valid(optarg)) {
527                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
528                                 return -EINVAL;
529                         }
530
531                         n = strv_env_set(arg_setenv, optarg);
532                         if (!n)
533                                 return log_oom();
534
535                         strv_free(arg_setenv);
536                         arg_setenv = n;
537                         break;
538                 }
539
540                 case 'q':
541                         arg_quiet = true;
542                         break;
543
544                 case ARG_SHARE_SYSTEM:
545                         arg_share_system = true;
546                         break;
547
548                 case ARG_REGISTER:
549                         r = parse_boolean(optarg);
550                         if (r < 0) {
551                                 log_error("Failed to parse --register= argument: %s", optarg);
552                                 return r;
553                         }
554
555                         arg_register = r;
556                         break;
557
558                 case ARG_KEEP_UNIT:
559                         arg_keep_unit = true;
560                         break;
561
562                 case ARG_PERSONALITY:
563
564                         arg_personality = personality_from_string(optarg);
565                         if (arg_personality == 0xffffffffLU) {
566                                 log_error("Unknown or unsupported personality '%s'.", optarg);
567                                 return -EINVAL;
568                         }
569
570                         break;
571
572                 case ARG_VOLATILE:
573
574                         if (!optarg)
575                                 arg_volatile = VOLATILE_YES;
576                         else {
577                                 r = parse_boolean(optarg);
578                                 if (r < 0) {
579                                         if (streq(optarg, "state"))
580                                                 arg_volatile = VOLATILE_STATE;
581                                         else {
582                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
583                                                 return r;
584                                         }
585                                 } else
586                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
587                         }
588
589                         break;
590
591                 case '?':
592                         return -EINVAL;
593
594                 default:
595                         assert_not_reached("Unhandled option");
596                 }
597         }
598
599         if (arg_share_system)
600                 arg_register = false;
601
602         if (arg_boot && arg_share_system) {
603                 log_error("--boot and --share-system may not be combined.");
604                 return -EINVAL;
605         }
606
607         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
608                 log_error("--keep-unit may not be used when invoked from a user session.");
609                 return -EINVAL;
610         }
611
612         if (arg_directory && arg_image) {
613                 log_error("--directory= and --image= may not be combined.");
614                 return -EINVAL;
615         }
616
617         if (arg_volatile != VOLATILE_NO && arg_read_only) {
618                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
619                 return -EINVAL;
620         }
621
622         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
623
624         return 1;
625 }
626
627 static int mount_all(const char *dest) {
628
629         typedef struct MountPoint {
630                 const char *what;
631                 const char *where;
632                 const char *type;
633                 const char *options;
634                 unsigned long flags;
635                 bool fatal;
636         } MountPoint;
637
638         static const MountPoint mount_table[] = {
639                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
640                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
641                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
642                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
643                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
644                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
645                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
646                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
647 #ifdef HAVE_SELINUX
648                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
649                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
650 #endif
651         };
652
653         unsigned k;
654         int r = 0;
655
656         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
657                 _cleanup_free_ char *where = NULL;
658 #ifdef HAVE_SELINUX
659                 _cleanup_free_ char *options = NULL;
660 #endif
661                 const char *o;
662                 int t;
663
664                 where = strjoin(dest, "/", mount_table[k].where, NULL);
665                 if (!where)
666                         return log_oom();
667
668                 t = path_is_mount_point(where, true);
669                 if (t < 0) {
670                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
671
672                         if (r == 0)
673                                 r = t;
674
675                         continue;
676                 }
677
678                 /* Skip this entry if it is not a remount. */
679                 if (mount_table[k].what && t > 0)
680                         continue;
681
682                 mkdir_p(where, 0755);
683
684 #ifdef HAVE_SELINUX
685                 if (arg_selinux_apifs_context &&
686                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
687                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
688                         if (!options)
689                                 return log_oom();
690
691                         o = options;
692                 } else
693 #endif
694                         o = mount_table[k].options;
695
696
697                 if (mount(mount_table[k].what,
698                           where,
699                           mount_table[k].type,
700                           mount_table[k].flags,
701                           o) < 0 &&
702                     mount_table[k].fatal) {
703
704                         log_error("mount(%s) failed: %m", where);
705
706                         if (r == 0)
707                                 r = -errno;
708                 }
709         }
710
711         return r;
712 }
713
714 static int mount_binds(const char *dest, char **l, bool ro) {
715         char **x, **y;
716
717         STRV_FOREACH_PAIR(x, y, l) {
718                 _cleanup_free_ char *where = NULL;
719                 struct stat source_st, dest_st;
720                 int r;
721
722                 if (stat(*x, &source_st) < 0) {
723                         log_error("Failed to stat %s: %m", *x);
724                         return -errno;
725                 }
726
727                 where = strappend(dest, *y);
728                 if (!where)
729                         return log_oom();
730
731                 r = stat(where, &dest_st);
732                 if (r == 0) {
733                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
734                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
735                                 return -EINVAL;
736                         }
737                 } else if (errno == ENOENT) {
738                         r = mkdir_parents_label(where, 0755);
739                         if (r < 0) {
740                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
741                                 return r;
742                         }
743                 } else {
744                         log_error("Failed to bind mount %s: %m", *x);
745                         return -errno;
746                 }
747
748                 /* Create the mount point, but be conservative -- refuse to create block
749                  * and char devices. */
750                 if (S_ISDIR(source_st.st_mode))
751                         mkdir_label(where, 0755);
752                 else if (S_ISFIFO(source_st.st_mode))
753                         mkfifo(where, 0644);
754                 else if (S_ISSOCK(source_st.st_mode))
755                         mknod(where, 0644 | S_IFSOCK, 0);
756                 else if (S_ISREG(source_st.st_mode))
757                         touch(where);
758                 else {
759                         log_error("Refusing to create mountpoint for file: %s", *x);
760                         return -ENOTSUP;
761                 }
762
763                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
764                         log_error("mount(%s) failed: %m", where);
765                         return -errno;
766                 }
767
768                 if (ro) {
769                         r = bind_remount_recursive(where, true);
770                         if (r < 0) {
771                                 log_error("Read-Only bind mount failed: %s", strerror(-r));
772                                 return r;
773                         }
774                 }
775         }
776
777         return 0;
778 }
779
780 static int mount_tmpfs(const char *dest) {
781         char **i, **o;
782
783         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
784                 _cleanup_free_ char *where = NULL;
785
786                 where = strappend(dest, *i);
787                 if (!where)
788                         return log_oom();
789
790                 mkdir_label(where, 0755);
791
792                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
793                         log_error("tmpfs mount to %s failed: %m", where);
794                         return -errno;
795                 }
796         }
797
798         return 0;
799 }
800
801 static int setup_timezone(const char *dest) {
802         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
803         char *z, *y;
804         int r;
805
806         assert(dest);
807
808         /* Fix the timezone, if possible */
809         r = readlink_malloc("/etc/localtime", &p);
810         if (r < 0) {
811                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
812                 return 0;
813         }
814
815         z = path_startswith(p, "../usr/share/zoneinfo/");
816         if (!z)
817                 z = path_startswith(p, "/usr/share/zoneinfo/");
818         if (!z) {
819                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
820                 return 0;
821         }
822
823         where = strappend(dest, "/etc/localtime");
824         if (!where)
825                 return log_oom();
826
827         r = readlink_malloc(where, &q);
828         if (r >= 0) {
829                 y = path_startswith(q, "../usr/share/zoneinfo/");
830                 if (!y)
831                         y = path_startswith(q, "/usr/share/zoneinfo/");
832
833                 /* Already pointing to the right place? Then do nothing .. */
834                 if (y && streq(y, z))
835                         return 0;
836         }
837
838         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
839         if (!check)
840                 return log_oom();
841
842         if (access(check, F_OK) < 0) {
843                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
844                 return 0;
845         }
846
847         what = strappend("../usr/share/zoneinfo/", z);
848         if (!what)
849                 return log_oom();
850
851         mkdir_parents(where, 0755);
852         unlink(where);
853
854         if (symlink(what, where) < 0) {
855                 log_error("Failed to correct timezone of container: %m");
856                 return 0;
857         }
858
859         return 0;
860 }
861
862 static int setup_resolv_conf(const char *dest) {
863         _cleanup_free_ char *where = NULL;
864
865         assert(dest);
866
867         if (arg_private_network)
868                 return 0;
869
870         /* Fix resolv.conf, if possible */
871         where = strappend(dest, "/etc/resolv.conf");
872         if (!where)
873                 return log_oom();
874
875         /* We don't really care for the results of this really. If it
876          * fails, it fails, but meh... */
877         mkdir_parents(where, 0755);
878         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
879
880         return 0;
881 }
882
883 static int setup_volatile_state(const char *directory) {
884         const char *p;
885         int r;
886
887         assert(directory);
888
889         if (arg_volatile != VOLATILE_STATE)
890                 return 0;
891
892         /* --volatile=state means we simply overmount /var
893            with a tmpfs, and the rest read-only. */
894
895         r = bind_remount_recursive(directory, true);
896         if (r < 0) {
897                 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
898                 return r;
899         }
900
901         p = strappenda(directory, "/var");
902         mkdir(p, 0755);
903
904         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
905                 log_error("Failed to mount tmpfs to /var: %m");
906                 return -errno;
907         }
908
909         return 0;
910 }
911
912 static int setup_volatile(const char *directory) {
913         bool tmpfs_mounted = false, bind_mounted = false;
914         char template[] = "/tmp/nspawn-volatile-XXXXXX";
915         const char *f, *t;
916         int r;
917
918         assert(directory);
919
920         if (arg_volatile != VOLATILE_YES)
921                 return 0;
922
923         /* --volatile=yes means we mount a tmpfs to the root dir, and
924            the original /usr to use inside it, and that read-only. */
925
926         if (!mkdtemp(template)) {
927                 log_error("Failed to create temporary directory: %m");
928                 return -errno;
929         }
930
931         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
932                 log_error("Failed to mount tmpfs for root directory: %m");
933                 r = -errno;
934                 goto fail;
935         }
936
937         tmpfs_mounted = true;
938
939         f = strappenda(directory, "/usr");
940         t = strappenda(template, "/usr");
941
942         mkdir(t, 0755);
943         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
944                 log_error("Failed to create /usr bind mount: %m");
945                 r = -errno;
946                 goto fail;
947         }
948
949         bind_mounted = true;
950
951         r = bind_remount_recursive(t, true);
952         if (r < 0) {
953                 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
954                 goto fail;
955         }
956
957         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
958                 log_error("Failed to move root mount: %m");
959                 r = -errno;
960                 goto fail;
961         }
962
963         rmdir(template);
964
965         return 0;
966
967 fail:
968         if (bind_mounted)
969                 umount(t);
970         if (tmpfs_mounted)
971                 umount(template);
972         rmdir(template);
973         return r;
974 }
975
976 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
977
978         snprintf(s, 37,
979                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
980                  SD_ID128_FORMAT_VAL(id));
981
982         return s;
983 }
984
985 static int setup_boot_id(const char *dest) {
986         _cleanup_free_ char *from = NULL, *to = NULL;
987         sd_id128_t rnd = {};
988         char as_uuid[37];
989         int r;
990
991         assert(dest);
992
993         if (arg_share_system)
994                 return 0;
995
996         /* Generate a new randomized boot ID, so that each boot-up of
997          * the container gets a new one */
998
999         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1000         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1001         if (!from || !to)
1002                 return log_oom();
1003
1004         r = sd_id128_randomize(&rnd);
1005         if (r < 0) {
1006                 log_error("Failed to generate random boot id: %s", strerror(-r));
1007                 return r;
1008         }
1009
1010         id128_format_as_uuid(rnd, as_uuid);
1011
1012         r = write_string_file(from, as_uuid);
1013         if (r < 0) {
1014                 log_error("Failed to write boot id: %s", strerror(-r));
1015                 return r;
1016         }
1017
1018         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1019                 log_error("Failed to bind mount boot id: %m");
1020                 r = -errno;
1021         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1022                 log_warning("Failed to make boot id read-only: %m");
1023
1024         unlink(from);
1025         return r;
1026 }
1027
1028 static int copy_devnodes(const char *dest) {
1029
1030         static const char devnodes[] =
1031                 "null\0"
1032                 "zero\0"
1033                 "full\0"
1034                 "random\0"
1035                 "urandom\0"
1036                 "tty\0";
1037
1038         const char *d;
1039         int r = 0;
1040         _cleanup_umask_ mode_t u;
1041
1042         assert(dest);
1043
1044         u = umask(0000);
1045
1046         NULSTR_FOREACH(d, devnodes) {
1047                 _cleanup_free_ char *from = NULL, *to = NULL;
1048                 struct stat st;
1049
1050                 from = strappend("/dev/", d);
1051                 to = strjoin(dest, "/dev/", d, NULL);
1052                 if (!from || !to)
1053                         return log_oom();
1054
1055                 if (stat(from, &st) < 0) {
1056
1057                         if (errno != ENOENT) {
1058                                 log_error("Failed to stat %s: %m", from);
1059                                 return -errno;
1060                         }
1061
1062                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1063
1064                         log_error("%s is not a char or block device, cannot copy", from);
1065                         return -EIO;
1066
1067                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1068
1069                         log_error("mknod(%s) failed: %m", dest);
1070                         return  -errno;
1071                 }
1072         }
1073
1074         return r;
1075 }
1076
1077 static int setup_ptmx(const char *dest) {
1078         _cleanup_free_ char *p = NULL;
1079
1080         p = strappend(dest, "/dev/ptmx");
1081         if (!p)
1082                 return log_oom();
1083
1084         if (symlink("pts/ptmx", p) < 0) {
1085                 log_error("Failed to create /dev/ptmx symlink: %m");
1086                 return -errno;
1087         }
1088
1089         return 0;
1090 }
1091
1092 static int setup_dev_console(const char *dest, const char *console) {
1093         _cleanup_umask_ mode_t u;
1094         const char *to;
1095         struct stat st;
1096         int r;
1097
1098         assert(dest);
1099         assert(console);
1100
1101         u = umask(0000);
1102
1103         if (stat("/dev/null", &st) < 0) {
1104                 log_error("Failed to stat /dev/null: %m");
1105                 return -errno;
1106         }
1107
1108         r = chmod_and_chown(console, 0600, 0, 0);
1109         if (r < 0) {
1110                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
1111                 return r;
1112         }
1113
1114         /* We need to bind mount the right tty to /dev/console since
1115          * ptys can only exist on pts file systems. To have something
1116          * to bind mount things on we create a device node first, and
1117          * use /dev/null for that since we the cgroups device policy
1118          * allows us to create that freely, while we cannot create
1119          * /dev/console. (Note that the major minor doesn't actually
1120          * matter here, since we mount it over anyway). */
1121
1122         to = strappenda(dest, "/dev/console");
1123         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1124                 log_error("mknod() for /dev/console failed: %m");
1125                 return -errno;
1126         }
1127
1128         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1129                 log_error("Bind mount for /dev/console failed: %m");
1130                 return -errno;
1131         }
1132
1133         return 0;
1134 }
1135
1136 static int setup_kmsg(const char *dest, int kmsg_socket) {
1137         _cleanup_free_ char *from = NULL, *to = NULL;
1138         int r, fd, k;
1139         _cleanup_umask_ mode_t u;
1140         union {
1141                 struct cmsghdr cmsghdr;
1142                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1143         } control = {};
1144         struct msghdr mh = {
1145                 .msg_control = &control,
1146                 .msg_controllen = sizeof(control),
1147         };
1148         struct cmsghdr *cmsg;
1149
1150         assert(dest);
1151         assert(kmsg_socket >= 0);
1152
1153         u = umask(0000);
1154
1155         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1156          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1157          * on the reading side behave very similar to /proc/kmsg,
1158          * their writing side behaves differently from /dev/kmsg in
1159          * that writing blocks when nothing is reading. In order to
1160          * avoid any problems with containers deadlocking due to this
1161          * we simply make /dev/kmsg unavailable to the container. */
1162         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1163             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1164                 return log_oom();
1165
1166         if (mkfifo(from, 0600) < 0) {
1167                 log_error("mkfifo() for /dev/kmsg failed: %m");
1168                 return -errno;
1169         }
1170
1171         r = chmod_and_chown(from, 0600, 0, 0);
1172         if (r < 0) {
1173                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1174                 return r;
1175         }
1176
1177         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1178                 log_error("Bind mount for /proc/kmsg failed: %m");
1179                 return -errno;
1180         }
1181
1182         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1183         if (fd < 0) {
1184                 log_error("Failed to open fifo: %m");
1185                 return -errno;
1186         }
1187
1188         cmsg = CMSG_FIRSTHDR(&mh);
1189         cmsg->cmsg_level = SOL_SOCKET;
1190         cmsg->cmsg_type = SCM_RIGHTS;
1191         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1192         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1193
1194         mh.msg_controllen = cmsg->cmsg_len;
1195
1196         /* Store away the fd in the socket, so that it stays open as
1197          * long as we run the child */
1198         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1199         safe_close(fd);
1200
1201         if (k < 0) {
1202                 log_error("Failed to send FIFO fd: %m");
1203                 return -errno;
1204         }
1205
1206         /* And now make the FIFO unavailable as /dev/kmsg... */
1207         unlink(from);
1208         return 0;
1209 }
1210
1211 static int setup_hostname(void) {
1212
1213         if (arg_share_system)
1214                 return 0;
1215
1216         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1217                 return -errno;
1218
1219         return 0;
1220 }
1221
1222 static int setup_journal(const char *directory) {
1223         sd_id128_t machine_id, this_id;
1224         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1225         char *id;
1226         int r;
1227
1228         p = strappend(directory, "/etc/machine-id");
1229         if (!p)
1230                 return log_oom();
1231
1232         r = read_one_line_file(p, &b);
1233         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1234                 return 0;
1235         else if (r < 0) {
1236                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1237                 return r;
1238         }
1239
1240         id = strstrip(b);
1241         if (isempty(id) && arg_link_journal == LINK_AUTO)
1242                 return 0;
1243
1244         /* Verify validity */
1245         r = sd_id128_from_string(id, &machine_id);
1246         if (r < 0) {
1247                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1248                 return r;
1249         }
1250
1251         r = sd_id128_get_machine(&this_id);
1252         if (r < 0) {
1253                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1254                 return r;
1255         }
1256
1257         if (sd_id128_equal(machine_id, this_id)) {
1258                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1259                          "Host and machine ids are equal (%s): refusing to link journals", id);
1260                 if (arg_link_journal == LINK_AUTO)
1261                         return 0;
1262                 return
1263                         -EEXIST;
1264         }
1265
1266         if (arg_link_journal == LINK_NO)
1267                 return 0;
1268
1269         free(p);
1270         p = strappend("/var/log/journal/", id);
1271         q = strjoin(directory, "/var/log/journal/", id, NULL);
1272         if (!p || !q)
1273                 return log_oom();
1274
1275         if (path_is_mount_point(p, false) > 0) {
1276                 if (arg_link_journal != LINK_AUTO) {
1277                         log_error("%s: already a mount point, refusing to use for journal", p);
1278                         return -EEXIST;
1279                 }
1280
1281                 return 0;
1282         }
1283
1284         if (path_is_mount_point(q, false) > 0) {
1285                 if (arg_link_journal != LINK_AUTO) {
1286                         log_error("%s: already a mount point, refusing to use for journal", q);
1287                         return -EEXIST;
1288                 }
1289
1290                 return 0;
1291         }
1292
1293         r = readlink_and_make_absolute(p, &d);
1294         if (r >= 0) {
1295                 if ((arg_link_journal == LINK_GUEST ||
1296                      arg_link_journal == LINK_AUTO) &&
1297                     path_equal(d, q)) {
1298
1299                         r = mkdir_p(q, 0755);
1300                         if (r < 0)
1301                                 log_warning("failed to create directory %s: %m", q);
1302                         return 0;
1303                 }
1304
1305                 if (unlink(p) < 0) {
1306                         log_error("Failed to remove symlink %s: %m", p);
1307                         return -errno;
1308                 }
1309         } else if (r == -EINVAL) {
1310
1311                 if (arg_link_journal == LINK_GUEST &&
1312                     rmdir(p) < 0) {
1313
1314                         if (errno == ENOTDIR) {
1315                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1316                                 return r;
1317                         } else {
1318                                 log_error("Failed to remove %s: %m", p);
1319                                 return -errno;
1320                         }
1321                 }
1322         } else if (r != -ENOENT) {
1323                 log_error("readlink(%s) failed: %m", p);
1324                 return r;
1325         }
1326
1327         if (arg_link_journal == LINK_GUEST) {
1328
1329                 if (symlink(q, p) < 0) {
1330                         log_error("Failed to symlink %s to %s: %m", q, p);
1331                         return -errno;
1332                 }
1333
1334                 r = mkdir_p(q, 0755);
1335                 if (r < 0)
1336                         log_warning("failed to create directory %s: %m", q);
1337                 return 0;
1338         }
1339
1340         if (arg_link_journal == LINK_HOST) {
1341                 r = mkdir_p(p, 0755);
1342                 if (r < 0) {
1343                         log_error("Failed to create %s: %m", p);
1344                         return r;
1345                 }
1346
1347         } else if (access(p, F_OK) < 0)
1348                 return 0;
1349
1350         if (dir_is_empty(q) == 0)
1351                 log_warning("%s is not empty, proceeding anyway.", q);
1352
1353         r = mkdir_p(q, 0755);
1354         if (r < 0) {
1355                 log_error("Failed to create %s: %m", q);
1356                 return r;
1357         }
1358
1359         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1360                 log_error("Failed to bind mount journal from host into guest: %m");
1361                 return -errno;
1362         }
1363
1364         return 0;
1365 }
1366
1367 static int setup_kdbus(const char *dest, const char *path) {
1368         const char *p;
1369
1370         if (!path)
1371                 return 0;
1372
1373         p = strappenda(dest, "/dev/kdbus");
1374         if (mkdir(p, 0755) < 0) {
1375                 log_error("Failed to create kdbus path: %m");
1376                 return  -errno;
1377         }
1378
1379         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1380                 log_error("Failed to mount kdbus domain path: %m");
1381                 return -errno;
1382         }
1383
1384         return 0;
1385 }
1386
1387 static int drop_capabilities(void) {
1388         return capability_bounding_set_drop(~arg_retain, false);
1389 }
1390
1391 static int register_machine(pid_t pid) {
1392         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1393         _cleanup_bus_unref_ sd_bus *bus = NULL;
1394         int r;
1395
1396         if (!arg_register)
1397                 return 0;
1398
1399         r = sd_bus_default_system(&bus);
1400         if (r < 0) {
1401                 log_error("Failed to open system bus: %s", strerror(-r));
1402                 return r;
1403         }
1404
1405         if (arg_keep_unit) {
1406                 r = sd_bus_call_method(
1407                                 bus,
1408                                 "org.freedesktop.machine1",
1409                                 "/org/freedesktop/machine1",
1410                                 "org.freedesktop.machine1.Manager",
1411                                 "RegisterMachine",
1412                                 &error,
1413                                 NULL,
1414                                 "sayssus",
1415                                 arg_machine,
1416                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1417                                 "nspawn",
1418                                 "container",
1419                                 (uint32_t) pid,
1420                                 strempty(arg_directory));
1421         } else {
1422                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1423
1424                 r = sd_bus_message_new_method_call(
1425                                 bus,
1426                                 &m,
1427                                 "org.freedesktop.machine1",
1428                                 "/org/freedesktop/machine1",
1429                                 "org.freedesktop.machine1.Manager",
1430                                 "CreateMachine");
1431                 if (r < 0) {
1432                         log_error("Failed to create message: %s", strerror(-r));
1433                         return r;
1434                 }
1435
1436                 r = sd_bus_message_append(
1437                                 m,
1438                                 "sayssus",
1439                                 arg_machine,
1440                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1441                                 "nspawn",
1442                                 "container",
1443                                 (uint32_t) pid,
1444                                 strempty(arg_directory));
1445                 if (r < 0) {
1446                         log_error("Failed to append message arguments: %s", strerror(-r));
1447                         return r;
1448                 }
1449
1450                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1451                 if (r < 0) {
1452                         log_error("Failed to open container: %s", strerror(-r));
1453                         return r;
1454                 }
1455
1456                 if (!isempty(arg_slice)) {
1457                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1458                         if (r < 0) {
1459                                 log_error("Failed to append slice: %s", strerror(-r));
1460                                 return r;
1461                         }
1462                 }
1463
1464                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1465                 if (r < 0) {
1466                         log_error("Failed to add device policy: %s", strerror(-r));
1467                         return r;
1468                 }
1469
1470                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1471                                           /* Allow the container to
1472                                            * access and create the API
1473                                            * device nodes, so that
1474                                            * PrivateDevices= in the
1475                                            * container can work
1476                                            * fine */
1477                                           "/dev/null", "rwm",
1478                                           "/dev/zero", "rwm",
1479                                           "/dev/full", "rwm",
1480                                           "/dev/random", "rwm",
1481                                           "/dev/urandom", "rwm",
1482                                           "/dev/tty", "rwm",
1483                                           /* Allow the container
1484                                            * access to ptys. However,
1485                                            * do not permit the
1486                                            * container to ever create
1487                                            * these device nodes. */
1488                                           "/dev/pts/ptmx", "rw",
1489                                           "char-pts", "rw",
1490                                           /* Allow the container
1491                                            * access to all kdbus
1492                                            * devices. Again, the
1493                                            * container cannot create
1494                                            * these nodes, only use
1495                                            * them. We use a pretty
1496                                            * open match here, so that
1497                                            * the kernel API can still
1498                                            * change. */
1499                                           "char-kdbus", "rw",
1500                                           "char-kdbus/*", "rw");
1501                 if (r < 0) {
1502                         log_error("Failed to add device whitelist: %s", strerror(-r));
1503                         return r;
1504                 }
1505
1506                 r = sd_bus_message_close_container(m);
1507                 if (r < 0) {
1508                         log_error("Failed to close container: %s", strerror(-r));
1509                         return r;
1510                 }
1511
1512                 r = sd_bus_call(bus, m, 0, &error, NULL);
1513         }
1514
1515         if (r < 0) {
1516                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1517                 return r;
1518         }
1519
1520         return 0;
1521 }
1522
1523 static int terminate_machine(pid_t pid) {
1524         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1525         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1526         _cleanup_bus_unref_ sd_bus *bus = NULL;
1527         const char *path;
1528         int r;
1529
1530         if (!arg_register)
1531                 return 0;
1532
1533         r = sd_bus_default_system(&bus);
1534         if (r < 0) {
1535                 log_error("Failed to open system bus: %s", strerror(-r));
1536                 return r;
1537         }
1538
1539         r = sd_bus_call_method(
1540                         bus,
1541                         "org.freedesktop.machine1",
1542                         "/org/freedesktop/machine1",
1543                         "org.freedesktop.machine1.Manager",
1544                         "GetMachineByPID",
1545                         &error,
1546                         &reply,
1547                         "u",
1548                         (uint32_t) pid);
1549         if (r < 0) {
1550                 /* Note that the machine might already have been
1551                  * cleaned up automatically, hence don't consider it a
1552                  * failure if we cannot get the machine object. */
1553                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1554                 return 0;
1555         }
1556
1557         r = sd_bus_message_read(reply, "o", &path);
1558         if (r < 0)
1559                 return bus_log_parse_error(r);
1560
1561         r = sd_bus_call_method(
1562                         bus,
1563                         "org.freedesktop.machine1",
1564                         path,
1565                         "org.freedesktop.machine1.Machine",
1566                         "Terminate",
1567                         &error,
1568                         NULL,
1569                         NULL);
1570         if (r < 0) {
1571                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1572                 return 0;
1573         }
1574
1575         return 0;
1576 }
1577
1578 static int reset_audit_loginuid(void) {
1579         _cleanup_free_ char *p = NULL;
1580         int r;
1581
1582         if (arg_share_system)
1583                 return 0;
1584
1585         r = read_one_line_file("/proc/self/loginuid", &p);
1586         if (r == -ENOENT)
1587                 return 0;
1588         if (r < 0) {
1589                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1590                 return r;
1591         }
1592
1593         /* Already reset? */
1594         if (streq(p, "4294967295"))
1595                 return 0;
1596
1597         r = write_string_file("/proc/self/loginuid", "4294967295");
1598         if (r < 0) {
1599                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1600                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1601                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1602                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1603                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1604
1605                 sleep(5);
1606         }
1607
1608         return 0;
1609 }
1610
1611 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1612
1613 static int get_mac(struct ether_addr *mac) {
1614         int r;
1615
1616         uint8_t result[8];
1617         size_t l, sz;
1618         uint8_t *v;
1619
1620         l = strlen(arg_machine);
1621         sz = sizeof(sd_id128_t) + l;
1622         v = alloca(sz);
1623
1624         /* fetch some persistent data unique to the host */
1625         r = sd_id128_get_machine((sd_id128_t*) v);
1626         if (r < 0)
1627                 return r;
1628
1629         /* combine with some data unique (on this host) to this
1630          * container instance */
1631         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1632
1633         /* Let's hash the host machine ID plus the container name. We
1634          * use a fixed, but originally randomly created hash key here. */
1635         siphash24(result, v, sz, HASH_KEY.bytes);
1636
1637         assert_cc(ETH_ALEN <= sizeof(result));
1638         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1639
1640         /* see eth_random_addr in the kernel */
1641         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1642         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1643
1644         return 0;
1645 }
1646
1647 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1648         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1649         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1650         struct ether_addr mac;
1651         int r;
1652
1653         if (!arg_private_network)
1654                 return 0;
1655
1656         if (!arg_network_veth)
1657                 return 0;
1658
1659         /* Use two different interface name prefixes depending whether
1660          * we are in bridge mode or not. */
1661         if (arg_network_bridge)
1662                 memcpy(iface_name, "vb-", 3);
1663         else
1664                 memcpy(iface_name, "ve-", 3);
1665         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1666
1667         r = get_mac(&mac);
1668         if (r < 0) {
1669                 log_error("Failed to generate predictable MAC address for host0");
1670                 return r;
1671         }
1672
1673         r = sd_rtnl_open(&rtnl, 0);
1674         if (r < 0) {
1675                 log_error("Failed to connect to netlink: %s", strerror(-r));
1676                 return r;
1677         }
1678
1679         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1680         if (r < 0) {
1681                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1682                 return r;
1683         }
1684
1685         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1686         if (r < 0) {
1687                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1688                 return r;
1689         }
1690
1691         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1692         if (r < 0) {
1693                 log_error("Failed to open netlink container: %s", strerror(-r));
1694                 return r;
1695         }
1696
1697         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1698         if (r < 0) {
1699                 log_error("Failed to open netlink container: %s", strerror(-r));
1700                 return r;
1701         }
1702
1703         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1704         if (r < 0) {
1705                 log_error("Failed to open netlink container: %s", strerror(-r));
1706                 return r;
1707         }
1708
1709         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1710         if (r < 0) {
1711                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1712                 return r;
1713         }
1714
1715         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1716         if (r < 0) {
1717                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1718                 return r;
1719         }
1720
1721         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1722         if (r < 0) {
1723                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1724                 return r;
1725         }
1726
1727         r = sd_rtnl_message_close_container(m);
1728         if (r < 0) {
1729                 log_error("Failed to close netlink container: %s", strerror(-r));
1730                 return r;
1731         }
1732
1733         r = sd_rtnl_message_close_container(m);
1734         if (r < 0) {
1735                 log_error("Failed to close netlink container: %s", strerror(-r));
1736                 return r;
1737         }
1738
1739         r = sd_rtnl_message_close_container(m);
1740         if (r < 0) {
1741                 log_error("Failed to close netlink container: %s", strerror(-r));
1742                 return r;
1743         }
1744
1745         r = sd_rtnl_call(rtnl, m, 0, NULL);
1746         if (r < 0) {
1747                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1748                 return r;
1749         }
1750
1751         return 0;
1752 }
1753
1754 static int setup_bridge(const char veth_name[]) {
1755         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1756         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1757         int r, bridge;
1758
1759         if (!arg_private_network)
1760                 return 0;
1761
1762         if (!arg_network_veth)
1763                 return 0;
1764
1765         if (!arg_network_bridge)
1766                 return 0;
1767
1768         bridge = (int) if_nametoindex(arg_network_bridge);
1769         if (bridge <= 0) {
1770                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1771                 return -errno;
1772         }
1773
1774         r = sd_rtnl_open(&rtnl, 0);
1775         if (r < 0) {
1776                 log_error("Failed to connect to netlink: %s", strerror(-r));
1777                 return r;
1778         }
1779
1780         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1781         if (r < 0) {
1782                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1783                 return r;
1784         }
1785
1786         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1787         if (r < 0) {
1788                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1789                 return r;
1790         }
1791
1792         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1793         if (r < 0) {
1794                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1795                 return r;
1796         }
1797
1798         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1799         if (r < 0) {
1800                 log_error("Failed to add netlink master field: %s", strerror(-r));
1801                 return r;
1802         }
1803
1804         r = sd_rtnl_call(rtnl, m, 0, NULL);
1805         if (r < 0) {
1806                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1807                 return r;
1808         }
1809
1810         return 0;
1811 }
1812
1813 static int parse_interface(struct udev *udev, const char *name) {
1814         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1815         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1816         int ifi;
1817
1818         ifi = (int) if_nametoindex(name);
1819         if (ifi <= 0) {
1820                 log_error("Failed to resolve interface %s: %m", name);
1821                 return -errno;
1822         }
1823
1824         sprintf(ifi_str, "n%i", ifi);
1825         d = udev_device_new_from_device_id(udev, ifi_str);
1826         if (!d) {
1827                 log_error("Failed to get udev device for interface %s: %m", name);
1828                 return -errno;
1829         }
1830
1831         if (udev_device_get_is_initialized(d) <= 0) {
1832                 log_error("Network interface %s is not initialized yet.", name);
1833                 return -EBUSY;
1834         }
1835
1836         return ifi;
1837 }
1838
1839 static int move_network_interfaces(pid_t pid) {
1840         _cleanup_udev_unref_ struct udev *udev = NULL;
1841         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1842         char **i;
1843         int r;
1844
1845         if (!arg_private_network)
1846                 return 0;
1847
1848         if (strv_isempty(arg_network_interfaces))
1849                 return 0;
1850
1851         r = sd_rtnl_open(&rtnl, 0);
1852         if (r < 0) {
1853                 log_error("Failed to connect to netlink: %s", strerror(-r));
1854                 return r;
1855         }
1856
1857         udev = udev_new();
1858         if (!udev) {
1859                 log_error("Failed to connect to udev.");
1860                 return -ENOMEM;
1861         }
1862
1863         STRV_FOREACH(i, arg_network_interfaces) {
1864                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1865                 int ifi;
1866
1867                 ifi = parse_interface(udev, *i);
1868                 if (ifi < 0)
1869                         return ifi;
1870
1871                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1872                 if (r < 0) {
1873                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1874                         return r;
1875                 }
1876
1877                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1878                 if (r < 0) {
1879                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1880                         return r;
1881                 }
1882
1883                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1884                 if (r < 0) {
1885                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1886                         return r;
1887                 }
1888         }
1889
1890         return 0;
1891 }
1892
1893 static int setup_macvlan(pid_t pid) {
1894         _cleanup_udev_unref_ struct udev *udev = NULL;
1895         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1896         char **i;
1897         int r;
1898
1899         if (!arg_private_network)
1900                 return 0;
1901
1902         if (strv_isempty(arg_network_macvlan))
1903                 return 0;
1904
1905         r = sd_rtnl_open(&rtnl, 0);
1906         if (r < 0) {
1907                 log_error("Failed to connect to netlink: %s", strerror(-r));
1908                 return r;
1909         }
1910
1911         udev = udev_new();
1912         if (!udev) {
1913                 log_error("Failed to connect to udev.");
1914                 return -ENOMEM;
1915         }
1916
1917         STRV_FOREACH(i, arg_network_macvlan) {
1918                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1919                 _cleanup_free_ char *n = NULL;
1920                 int ifi;
1921
1922                 ifi = parse_interface(udev, *i);
1923                 if (ifi < 0)
1924                         return ifi;
1925
1926                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1927                 if (r < 0) {
1928                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1929                         return r;
1930                 }
1931
1932                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1933                 if (r < 0) {
1934                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1935                         return r;
1936                 }
1937
1938                 n = strappend("mv-", *i);
1939                 if (!n)
1940                         return log_oom();
1941
1942                 strshorten(n, IFNAMSIZ-1);
1943
1944                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1945                 if (r < 0) {
1946                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1947                         return r;
1948                 }
1949
1950                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1951                 if (r < 0) {
1952                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1953                         return r;
1954                 }
1955
1956                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1957                 if (r < 0) {
1958                         log_error("Failed to open netlink container: %s", strerror(-r));
1959                         return r;
1960                 }
1961
1962                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1963                 if (r < 0) {
1964                         log_error("Failed to open netlink container: %s", strerror(-r));
1965                         return r;
1966                 }
1967
1968                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1969                 if (r < 0) {
1970                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1971                         return r;
1972                 }
1973
1974                 r = sd_rtnl_message_close_container(m);
1975                 if (r < 0) {
1976                         log_error("Failed to close netlink container: %s", strerror(-r));
1977                         return r;
1978                 }
1979
1980                 r = sd_rtnl_message_close_container(m);
1981                 if (r < 0) {
1982                         log_error("Failed to close netlink container: %s", strerror(-r));
1983                         return r;
1984                 }
1985
1986                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1987                 if (r < 0) {
1988                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1989                         return r;
1990                 }
1991         }
1992
1993         return 0;
1994 }
1995
/* Installs a seccomp filter that allows everything by default, makes the
 * system calls listed in denied[] fail with EPERM, and makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns the result of the last seccomp operation attempted (negative on
 * failure, after logging). Without HAVE_SECCOMP this is a no-op returning
 * 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int denied[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(denied); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), denied[n], 0);

                /* -EFAULT means unknown syscall: skip it silently */
                if (r < 0 && r != -EFAULT) {
                        log_error("Failed to block syscall: %s", strerror(-r));
                        goto finish;
                }
        }

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        /* Set the filter's NNP control attribute to 0 before loading */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* The filter context is released on every path */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2075
2076 static int setup_image(char **device_path, int *loop_nr) {
2077         struct loop_info64 info = {
2078                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2079         };
2080         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2081         _cleanup_free_ char* loopdev = NULL;
2082         struct stat st;
2083         int r, nr;
2084
2085         assert(device_path);
2086         assert(loop_nr);
2087
2088         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2089         if (fd < 0) {
2090                 log_error("Failed to open %s: %m", arg_image);
2091                 return -errno;
2092         }
2093
2094         if (fstat(fd, &st) < 0) {
2095                 log_error("Failed to stat %s: %m", arg_image);
2096                 return -errno;
2097         }
2098
2099         if (S_ISBLK(st.st_mode)) {
2100                 char *p;
2101
2102                 p = strdup(arg_image);
2103                 if (!p)
2104                         return log_oom();
2105
2106                 *device_path = p;
2107
2108                 *loop_nr = -1;
2109
2110                 r = fd;
2111                 fd = -1;
2112
2113                 return r;
2114         }
2115
2116         if (!S_ISREG(st.st_mode)) {
2117                 log_error("%s is not a regular file or block device: %m", arg_image);
2118                 return -EINVAL;
2119         }
2120
2121         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2122         if (control < 0) {
2123                 log_error("Failed to open /dev/loop-control: %m");
2124                 return -errno;
2125         }
2126
2127         nr = ioctl(control, LOOP_CTL_GET_FREE);
2128         if (nr < 0) {
2129                 log_error("Failed to allocate loop device: %m");
2130                 return -errno;
2131         }
2132
2133         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2134                 return log_oom();
2135
2136         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2137         if (loop < 0) {
2138                 log_error("Failed to open loop device %s: %m", loopdev);
2139                 return -errno;
2140         }
2141
2142         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2143                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2144                 return -errno;
2145         }
2146
2147         if (arg_read_only)
2148                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2149
2150         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2151                 log_error("Failed to set loopback settings on %s: %m", loopdev);
2152                 return -errno;
2153         }
2154
2155         *device_path = loopdev;
2156         loopdev = NULL;
2157
2158         *loop_nr = nr;
2159
2160         r = loop;
2161         loop = -1;
2162
2163         return r;
2164 }
2165
/* Inspects the GPT partition table of the block device open as `fd` and
 * picks the partitions to mount, based on their GPT type IDs (GPT_HOME,
 * GPT_SRV and, where defined, GPT_ROOT_NATIVE / GPT_ROOT_SECONDARY).
 * Partitions flagged GPT_FLAG_NO_AUTO are ignored. If several partitions
 * share a type, the one with the lowest partition number is used.
 *
 * On success returns 0 and stores newly allocated device node paths, owned
 * by the caller:
 *   - *root_device / *root_device_rw: the native root partition if one was
 *     found (*secondary = false), otherwise the secondary root partition
 *     (*secondary = true);
 *   - *home_device / *home_device_rw and *srv_device / *srv_device_rw: only
 *     written if a matching partition was found.
 * The *_rw flags are false for partitions carrying GPT_FLAG_READ_ONLY.
 *
 * Returns a negative errno-style value on failure (after logging), in
 * particular -EINVAL if there is no GPT or no root partition, and -ENOTSUP
 * if compiled without HAVE_BLKID. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerate = NULL;
        _cleanup_udev_device_unref_ struct udev_device *disk = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *head, *entry;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist partitions;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        /* Step 1: probe the device for its partition table */

        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(probe, fd, 0, 0);
        if (r != 0) {
                /* A failure that leaves errno unset is reported as OOM */
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }
        if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        /* Only GPT is accepted */
        blkid_probe_lookup_value(probe, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partitions = blkid_probe_get_partitions(probe);
        if (!partitions) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        /* Step 2: enumerate the udev devices below the whole-disk device */

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        disk = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!disk)
                return log_oom();

        enumerate = udev_enumerate_new(udev);
        if (!enumerate)
                return log_oom();

        r = udev_enumerate_add_match_parent(enumerate, disk);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(enumerate);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        /* Step 3: match each enumerated device against the partition list */

        head = udev_enumerate_get_list_entry(enumerate);
        udev_list_entry_foreach(entry, head) {
                _cleanup_udev_device_unref_ struct udev_device *part_dev;
                const char *type_str, *devnode;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition part;
                dev_t part_devnum;
                int partno;

                errno = 0;
                part_dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(entry));
                if (!part_dev) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number */
                part_devnum = udev_device_get_devnum(part_dev);
                if (major(part_devnum) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == part_devnum)
                        continue;

                devnode = udev_device_get_devnode(part_dev);
                if (!devnode)
                        continue;

                part = blkid_partlist_devno_to_partition(partitions, part_devnum);
                if (!part)
                        continue;

                flags = blkid_partition_get_flags(part);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                partno = blkid_partition_get_partno(part);
                if (partno < 0)
                        continue;

                type_str = blkid_partition_get_type_string(part);
                if (!type_str)
                        continue;

                if (sd_id128_from_string(type_str, &type_id) < 0)
                        continue;

                /* In each branch below an earlier pick is only replaced by
                 * a partition with a lower partition number. */
                if (sd_id128_equal(type_id, GPT_HOME)) {

                        if (home && partno >= home_nr)
                                continue;

                        home_nr = partno;
                        home_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(home);
                        home = strdup(devnode);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_id, GPT_SRV)) {

                        if (srv && partno >= srv_nr)
                                continue;

                        srv_nr = partno;
                        srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(srv);
                        srv = strdup(devnode);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                        if (root && partno >= root_nr)
                                continue;

                        root_nr = partno;
                        root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(root);
                        root = strdup(devnode);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && partno >= secondary_root_nr)
                                continue;

                        secondary_root_nr = partno;
                        secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(secondary_root);
                        secondary_root = strdup(devnode);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        /* Step 4: hand the results over to the caller */

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* The native root partition takes precedence over the secondary one.
         * Ownership of each string moves to the caller, hence the local
         * pointer is reset so that the cleanup handler does not free it. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2415
/* Mounts the block device `what` on `where`, or on `where` with `directory`
 * appended if `directory` is non-NULL. The file system type is detected with
 * libblkid. The mount is done with MS_NODEV, and read-only if `rw` is false
 * or arg_read_only is set.
 *
 * Returns 0 on success or a negative errno-style value on failure (after
 * logging): -EINVAL if the file system type cannot be determined, -ENOTSUP
 * for LUKS volumes or if compiled without HAVE_BLKID. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally read-only container overrides the per-device setting */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 on
         * an ambivalent result; both mean the type cannot be determined.
         * -1 is a real probing error and is handled by the branch below. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2481
/*
 * Mount the partitions found in a disk image below the container root.
 *
 *   where        directory the root file system is mounted on; the
 *                home and server data partitions go to "/home" and
 *                "/srv" beneath it.
 *   *_device     block device node to mount, or NULL to skip it.
 *   *_device_rw  whether the respective device may be mounted writable.
 *
 * Returns 0 on success, or the negative error returned by
 * mount_device() for the first device that could not be mounted.
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Honour the caller-supplied target instead of silently
         * falling back to the global arg_directory. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2517
2518 static void loop_remove(int nr, int *image_fd) {
2519         _cleanup_close_ int control = -1;
2520
2521         if (nr < 0)
2522                 return;
2523
2524         if (image_fd && *image_fd >= 0) {
2525                 ioctl(*image_fd, LOOP_CLR_FD);
2526                 *image_fd = safe_close(*image_fd);
2527         }
2528
2529         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2530         if (control < 0)
2531                 return;
2532
2533         ioctl(control, LOOP_CTL_REMOVE, nr);
2534 }
2535
2536 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2537         int pipe_fds[2];
2538         pid_t pid;
2539
2540         assert(database);
2541         assert(key);
2542         assert(rpid);
2543
2544         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2545                 log_error("Failed to allocate pipe: %m");
2546                 return -errno;
2547         }
2548
2549         pid = fork();
2550         if (pid < 0) {
2551                 log_error("Failed to fork getent child: %m");
2552                 return -errno;
2553         } else if (pid == 0) {
2554                 int nullfd;
2555                 char *empty_env = NULL;
2556
2557                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2558                         _exit(EXIT_FAILURE);
2559
2560                 if (pipe_fds[0] > 2)
2561                         safe_close(pipe_fds[0]);
2562                 if (pipe_fds[1] > 2)
2563                         safe_close(pipe_fds[1]);
2564
2565                 nullfd = open("/dev/null", O_RDWR);
2566                 if (nullfd < 0)
2567                         _exit(EXIT_FAILURE);
2568
2569                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2570                         _exit(EXIT_FAILURE);
2571
2572                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2573                         _exit(EXIT_FAILURE);
2574
2575                 if (nullfd > 2)
2576                         safe_close(nullfd);
2577
2578                 reset_all_signal_handlers();
2579                 close_all_fds(NULL, 0);
2580
2581                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2582                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2583                 _exit(EXIT_FAILURE);
2584         }
2585
2586         pipe_fds[1] = safe_close(pipe_fds[1]);
2587
2588         *rpid = pid;
2589
2590         return pipe_fds[0];
2591 }
2592
2593 static int change_uid_gid(char **_home) {
2594         char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2595         _cleanup_free_ uid_t *uids = NULL;
2596         _cleanup_free_ char *home = NULL;
2597         _cleanup_fclose_ FILE *f = NULL;
2598         _cleanup_close_ int fd = -1;
2599         unsigned n_uids = 0;
2600         size_t sz = 0, l;
2601         uid_t uid;
2602         gid_t gid;
2603         pid_t pid;
2604         int r;
2605
2606         assert(_home);
2607
2608         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2609                 /* Reset everything fully to 0, just in case */
2610
2611                 if (setgroups(0, NULL) < 0) {
2612                         log_error("setgroups() failed: %m");
2613                         return -errno;
2614                 }
2615
2616                 if (setresgid(0, 0, 0) < 0) {
2617                         log_error("setregid() failed: %m");
2618                         return -errno;
2619                 }
2620
2621                 if (setresuid(0, 0, 0) < 0) {
2622                         log_error("setreuid() failed: %m");
2623                         return -errno;
2624                 }
2625
2626                 *_home = NULL;
2627                 return 0;
2628         }
2629
2630         /* First, get user credentials */
2631         fd = spawn_getent("passwd", arg_user, &pid);
2632         if (fd < 0)
2633                 return fd;
2634
2635         f = fdopen(fd, "r");
2636         if (!f)
2637                 return log_oom();
2638         fd = -1;
2639
2640         if (!fgets(line, sizeof(line), f)) {
2641
2642                 if (!ferror(f)) {
2643                         log_error("Failed to resolve user %s.", arg_user);
2644                         return -ESRCH;
2645                 }
2646
2647                 log_error("Failed to read from getent: %m");
2648                 return -errno;
2649         }
2650
2651         truncate_nl(line);
2652
2653         wait_for_terminate_and_warn("getent passwd", pid);
2654
2655         x = strchr(line, ':');
2656         if (!x) {
2657                 log_error("/etc/passwd entry has invalid user field.");
2658                 return -EIO;
2659         }
2660
2661         u = strchr(x+1, ':');
2662         if (!u) {
2663                 log_error("/etc/passwd entry has invalid password field.");
2664                 return -EIO;
2665         }
2666
2667         u++;
2668         g = strchr(u, ':');
2669         if (!g) {
2670                 log_error("/etc/passwd entry has invalid UID field.");
2671                 return -EIO;
2672         }
2673
2674         *g = 0;
2675         g++;
2676         x = strchr(g, ':');
2677         if (!x) {
2678                 log_error("/etc/passwd entry has invalid GID field.");
2679                 return -EIO;
2680         }
2681
2682         *x = 0;
2683         h = strchr(x+1, ':');
2684         if (!h) {
2685                 log_error("/etc/passwd entry has invalid GECOS field.");
2686                 return -EIO;
2687         }
2688
2689         h++;
2690         x = strchr(h, ':');
2691         if (!x) {
2692                 log_error("/etc/passwd entry has invalid home directory field.");
2693                 return -EIO;
2694         }
2695
2696         *x = 0;
2697
2698         r = parse_uid(u, &uid);
2699         if (r < 0) {
2700                 log_error("Failed to parse UID of user.");
2701                 return -EIO;
2702         }
2703
2704         r = parse_gid(g, &gid);
2705         if (r < 0) {
2706                 log_error("Failed to parse GID of user.");
2707                 return -EIO;
2708         }
2709
2710         home = strdup(h);
2711         if (!home)
2712                 return log_oom();
2713
2714         /* Second, get group memberships */
2715         fd = spawn_getent("initgroups", arg_user, &pid);
2716         if (fd < 0)
2717                 return fd;
2718
2719         fclose(f);
2720         f = fdopen(fd, "r");
2721         if (!f)
2722                 return log_oom();
2723         fd = -1;
2724
2725         if (!fgets(line, sizeof(line), f)) {
2726                 if (!ferror(f)) {
2727                         log_error("Failed to resolve user %s.", arg_user);
2728                         return -ESRCH;
2729                 }
2730
2731                 log_error("Failed to read from getent: %m");
2732                 return -errno;
2733         }
2734
2735         truncate_nl(line);
2736
2737         wait_for_terminate_and_warn("getent initgroups", pid);
2738
2739         /* Skip over the username and subsequent separator whitespace */
2740         x = line;
2741         x += strcspn(x, WHITESPACE);
2742         x += strspn(x, WHITESPACE);
2743
2744         FOREACH_WORD(w, l, x, state) {
2745                 char c[l+1];
2746
2747                 memcpy(c, w, l);
2748                 c[l] = 0;
2749
2750                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2751                         return log_oom();
2752
2753                 r = parse_uid(c, &uids[n_uids++]);
2754                 if (r < 0) {
2755                         log_error("Failed to parse group data from getent.");
2756                         return -EIO;
2757                 }
2758         }
2759
2760         r = mkdir_parents(home, 0775);
2761         if (r < 0) {
2762                 log_error("Failed to make home root directory: %s", strerror(-r));
2763                 return r;
2764         }
2765
2766         r = mkdir_safe(home, 0755, uid, gid);
2767         if (r < 0 && r != -EEXIST) {
2768                 log_error("Failed to make home directory: %s", strerror(-r));
2769                 return r;
2770         }
2771
2772         fchown(STDIN_FILENO, uid, gid);
2773         fchown(STDOUT_FILENO, uid, gid);
2774         fchown(STDERR_FILENO, uid, gid);
2775
2776         if (setgroups(n_uids, uids) < 0) {
2777                 log_error("Failed to set auxiliary groups: %m");
2778                 return -errno;
2779         }
2780
2781         if (setresgid(gid, gid, gid) < 0) {
2782                 log_error("setregid() failed: %m");
2783                 return -errno;
2784         }
2785
2786         if (setresuid(uid, uid, uid) < 0) {
2787                 log_error("setreuid() failed: %m");
2788                 return -errno;
2789         }
2790
2791         if (_home) {
2792                 *_home = home;
2793                 home = NULL;
2794         }
2795
2796         return 0;
2797 }
2798
2799 /*
2800  * Return values:
2801  * < 0 : wait_for_terminate() failed to get the state of the
2802  *       container, the container was terminated by a signal, or
2803  *       failed for an unknown reason.  No change is made to the
2804  *       container argument.
2805  * > 0 : The program executed in the container terminated with an
2806  *       error.  The exit code of the program executed in the
2807  *       container is returned.  No change is made to the container
2808  *       argument.
2809  *   0 : The container is being rebooted, has been shut down or exited
2810  *       successfully.  The container argument has been set to either
2811  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2812  *
2813  * That is, success is indicated by a return value of zero, and an
2814  * error is indicated by a non-zero value.
2815  */
2816 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2817         int r;
2818         siginfo_t status;
2819
2820         r = wait_for_terminate(pid, &status);
2821         if (r < 0) {
2822                 log_warning("Failed to wait for container: %s", strerror(-r));
2823                 return r;
2824         }
2825
2826         switch (status.si_code) {
2827         case CLD_EXITED:
2828                 r = status.si_status;
2829                 if (r == 0) {
2830                         if (!arg_quiet)
2831                                 log_debug("Container %s exited successfully.",
2832                                           arg_machine);
2833
2834                         *container = CONTAINER_TERMINATED;
2835                 } else {
2836                         log_error("Container %s failed with error code %i.",
2837                                   arg_machine, status.si_status);
2838                 }
2839                 break;
2840
2841         case CLD_KILLED:
2842                 if (status.si_status == SIGINT) {
2843                         if (!arg_quiet)
2844                                 log_info("Container %s has been shut down.",
2845                                          arg_machine);
2846
2847                         *container = CONTAINER_TERMINATED;
2848                         r = 0;
2849                         break;
2850                 } else if (status.si_status == SIGHUP) {
2851                         if (!arg_quiet)
2852                                 log_info("Container %s is being rebooted.",
2853                                          arg_machine);
2854
2855                         *container = CONTAINER_REBOOTED;
2856                         r = 0;
2857                         break;
2858                 }
2859                 /* CLD_KILLED fallthrough */
2860
2861         case CLD_DUMPED:
2862                 log_error("Container %s terminated by signal %s.",
2863                           arg_machine, signal_to_string(status.si_status));
2864                 r = -1;
2865                 break;
2866
2867         default:
2868                 log_error("Container %s failed due to unknown reason.",
2869                           arg_machine);
2870                 r = -1;
2871                 break;
2872         }
2873
2874         return r;
2875 }
2876
/* Signal handler that intentionally does nothing. main() installs it
 * for SIGCHLD so that the signal interrupts blocking calls. */
static void nop_handler(int sig) {
}
2878
2879 int main(int argc, char *argv[]) {
2880
2881         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2882         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2883         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2884         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2885         _cleanup_fdset_free_ FDSet *fds = NULL;
2886         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2887         const char *console = NULL;
2888         char veth_name[IFNAMSIZ];
2889         bool secondary = false;
2890         sigset_t mask, mask_chld;
2891         pid_t pid = 0;
2892
2893         log_parse_environment();
2894         log_open();
2895
2896         k = parse_argv(argc, argv);
2897         if (k < 0)
2898                 goto finish;
2899         else if (k == 0) {
2900                 r = EXIT_SUCCESS;
2901                 goto finish;
2902         }
2903
2904         if (!arg_image) {
2905                 if (arg_directory) {
2906                         char *p;
2907
2908                         p = path_make_absolute_cwd(arg_directory);
2909                         free(arg_directory);
2910                         arg_directory = p;
2911                 } else
2912                         arg_directory = get_current_dir_name();
2913
2914                 if (!arg_directory) {
2915                         log_error("Failed to determine path, please use -D.");
2916                         goto finish;
2917                 }
2918                 path_kill_slashes(arg_directory);
2919         }
2920
2921         if (!arg_machine) {
2922                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2923                 if (!arg_machine) {
2924                         log_oom();
2925                         goto finish;
2926                 }
2927
2928                 hostname_cleanup(arg_machine, false);
2929                 if (isempty(arg_machine)) {
2930                         log_error("Failed to determine machine name automatically, please use -M.");
2931                         goto finish;
2932                 }
2933         }
2934
2935         if (geteuid() != 0) {
2936                 log_error("Need to be root.");
2937                 goto finish;
2938         }
2939
2940         if (sd_booted() <= 0) {
2941                 log_error("Not running on a systemd system.");
2942                 goto finish;
2943         }
2944
2945         log_close();
2946         n_fd_passed = sd_listen_fds(false);
2947         if (n_fd_passed > 0) {
2948                 k = fdset_new_listen_fds(&fds, false);
2949                 if (k < 0) {
2950                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2951                         goto finish;
2952                 }
2953         }
2954         fdset_close_others(fds);
2955         log_open();
2956
2957         if (arg_directory) {
2958                 if (path_equal(arg_directory, "/")) {
2959                         log_error("Spawning container on root directory not supported.");
2960                         goto finish;
2961                 }
2962
2963                 if (arg_boot) {
2964                         if (path_is_os_tree(arg_directory) <= 0) {
2965                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2966                                 goto finish;
2967                         }
2968                 } else {
2969                         const char *p;
2970
2971                         p = strappenda(arg_directory,
2972                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2973                         if (access(p, F_OK) < 0) {
2974                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2975                                 goto finish;
2976
2977                         }
2978                 }
2979         } else {
2980                 char template[] = "/tmp/nspawn-root-XXXXXX";
2981
2982                 if (!mkdtemp(template)) {
2983                         log_error("Failed to create temporary directory: %m");
2984                         r = -errno;
2985                         goto finish;
2986                 }
2987
2988                 arg_directory = strdup(template);
2989                 if (!arg_directory) {
2990                         r = log_oom();
2991                         goto finish;
2992                 }
2993
2994                 image_fd = setup_image(&device_path, &loop_nr);
2995                 if (image_fd < 0) {
2996                         r = image_fd;
2997                         goto finish;
2998                 }
2999
3000                 r = dissect_image(image_fd,
3001                                   &root_device, &root_device_rw,
3002                                   &home_device, &home_device_rw,
3003                                   &srv_device, &srv_device_rw,
3004                                   &secondary);
3005                 if (r < 0)
3006                         goto finish;
3007         }
3008
3009         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3010         if (master < 0) {
3011                 log_error("Failed to acquire pseudo tty: %m");
3012                 goto finish;
3013         }
3014
3015         console = ptsname(master);
3016         if (!console) {
3017                 log_error("Failed to determine tty name: %m");
3018                 goto finish;
3019         }
3020
3021         if (!arg_quiet)
3022                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3023                          arg_machine, arg_image ? arg_image : arg_directory);
3024
3025         if (unlockpt(master) < 0) {
3026                 log_error("Failed to unlock tty: %m");
3027                 goto finish;
3028         }
3029
3030         if (access("/dev/kdbus/control", F_OK) >= 0) {
3031
3032                 if (arg_share_system) {
3033                         kdbus_domain = strdup("/dev/kdbus");
3034                         if (!kdbus_domain) {
3035                                 log_oom();
3036                                 goto finish;
3037                         }
3038                 } else {
3039                         const char *ns;
3040
3041                         ns = strappenda("machine-", arg_machine);
3042                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
3043                         if (r < 0)
3044                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
3045                         else
3046                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
3047                 }
3048         }
3049
3050         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3051                 log_error("Failed to create kmsg socket pair: %m");
3052                 goto finish;
3053         }
3054
3055         sd_notify(0, "READY=1");
3056
3057         assert_se(sigemptyset(&mask) == 0);
3058         assert_se(sigemptyset(&mask_chld) == 0);
3059         sigaddset(&mask_chld, SIGCHLD);
3060         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3061         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3062
3063         for (;;) {
3064                 ContainerStatus container_status;
3065                 int eventfds[2] = { -1, -1 };
3066                 struct sigaction sa = {
3067                         .sa_handler = nop_handler,
3068                         .sa_flags = SA_NOCLDSTOP,
3069                 };
3070
3071                 /* Child can be killed before execv(), so handle SIGCHLD
3072                  * in order to interrupt parent's blocking calls and
3073                  * give it a chance to call wait() and terminate. */
3074                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3075                 if (r < 0) {
3076                         log_error("Failed to change the signal mask: %m");
3077                         goto finish;
3078                 }
3079
3080                 r = sigaction(SIGCHLD, &sa, NULL);
3081                 if (r < 0) {
3082                         log_error("Failed to install SIGCHLD handler: %m");
3083                         goto finish;
3084                 }
3085
3086                 pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
3087                                          (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3088                                          (arg_private_network ? CLONE_NEWNET : 0), eventfds);
3089                 if (pid < 0) {
3090                         if (errno == EINVAL)
3091                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3092                         else
3093                                 log_error("clone() failed: %m");
3094
3095                         r = pid;
3096                         goto finish;
3097                 }
3098
3099                 if (pid == 0) {
3100                         /* child */
3101                         _cleanup_free_ char *home = NULL;
3102                         unsigned n_env = 2;
3103                         const char *envp[] = {
3104                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3105                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3106                                 NULL, /* TERM */
3107                                 NULL, /* HOME */
3108                                 NULL, /* USER */
3109                                 NULL, /* LOGNAME */
3110                                 NULL, /* container_uuid */
3111                                 NULL, /* LISTEN_FDS */
3112                                 NULL, /* LISTEN_PID */
3113                                 NULL
3114                         };
3115                         char **env_use;
3116
3117                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3118                         if (envp[n_env])
3119                                 n_env ++;
3120
3121                         master = safe_close(master);
3122
3123                         close_nointr(STDIN_FILENO);
3124                         close_nointr(STDOUT_FILENO);
3125                         close_nointr(STDERR_FILENO);
3126
3127                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3128
3129                         reset_all_signal_handlers();
3130
3131                         assert_se(sigemptyset(&mask) == 0);
3132                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
3133
3134                         k = open_terminal(console, O_RDWR);
3135                         if (k != STDIN_FILENO) {
3136                                 if (k >= 0) {
3137                                         safe_close(k);
3138                                         k = -EINVAL;
3139                                 }
3140
3141                                 log_error("Failed to open console: %s", strerror(-k));
3142                                 goto child_fail;
3143                         }
3144
3145                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3146                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3147                                 log_error("Failed to duplicate console: %m");
3148                                 goto child_fail;
3149                         }
3150
3151                         if (setsid() < 0) {
3152                                 log_error("setsid() failed: %m");
3153                                 goto child_fail;
3154                         }
3155
3156                         if (reset_audit_loginuid() < 0)
3157                                 goto child_fail;
3158
3159                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3160                                 log_error("PR_SET_PDEATHSIG failed: %m");
3161                                 goto child_fail;
3162                         }
3163
3164                         /* Mark everything as slave, so that we still
3165                          * receive mounts from the real root, but don't
3166                          * propagate mounts to the real root. */
3167                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3168                                 log_error("MS_SLAVE|MS_REC failed: %m");
3169                                 goto child_fail;
3170                         }
3171
3172                         if (mount_devices(arg_directory,
3173                                           root_device, root_device_rw,
3174                                           home_device, home_device_rw,
3175                                           srv_device, srv_device_rw) < 0)
3176                                 goto child_fail;
3177
3178                         /* Turn directory into bind mount */
3179                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3180                                 log_error("Failed to make bind mount: %m");
3181                                 goto child_fail;
3182                         }
3183
3184                         r = setup_volatile(arg_directory);
3185                         if (r < 0)
3186                                 goto child_fail;
3187
3188                         if (setup_volatile_state(arg_directory) < 0)
3189                                 goto child_fail;
3190
3191                         r = base_filesystem_create(arg_directory);
3192                         if (r < 0)
3193                                 goto child_fail;
3194
3195                         if (arg_read_only) {
3196                                 k = bind_remount_recursive(arg_directory, true);
3197                                 if (k < 0) {
3198                                         log_error("Failed to make tree read-only: %s", strerror(-k));
3199                                         goto child_fail;
3200                                 }
3201                         }
3202
3203                         if (mount_all(arg_directory) < 0)
3204                                 goto child_fail;
3205
3206                         if (copy_devnodes(arg_directory) < 0)
3207                                 goto child_fail;
3208
3209                         if (setup_ptmx(arg_directory) < 0)
3210                                 goto child_fail;
3211
3212                         dev_setup(arg_directory);
3213
3214                         if (setup_seccomp() < 0)
3215                                 goto child_fail;
3216
3217                         if (setup_dev_console(arg_directory, console) < 0)
3218                                 goto child_fail;
3219
3220                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3221                                 goto child_fail;
3222
3223                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3224
3225                         if (setup_boot_id(arg_directory) < 0)
3226                                 goto child_fail;
3227
3228                         if (setup_timezone(arg_directory) < 0)
3229                                 goto child_fail;
3230
3231                         if (setup_resolv_conf(arg_directory) < 0)
3232                                 goto child_fail;
3233
3234                         if (setup_journal(arg_directory) < 0)
3235                                 goto child_fail;
3236
3237                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3238                                 goto child_fail;
3239
3240                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3241                                 goto child_fail;
3242
3243                         if (mount_tmpfs(arg_directory) < 0)
3244                                 goto child_fail;
3245
3246                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3247                                 goto child_fail;
3248
3249                         /* Tell the parent that we are ready, and that
3250                          * it can cgroupify us to that we lack access
3251                          * to certain devices and resources. */
3252                         r = eventfd_send_state(eventfds[1],
3253                                                EVENTFD_CHILD_SUCCEEDED);
3254                         eventfds[1] = safe_close(eventfds[1]);
3255                         if (r < 0)
3256                                 goto child_fail;
3257
3258                         if (chdir(arg_directory) < 0) {
3259                                 log_error("chdir(%s) failed: %m", arg_directory);
3260                                 goto child_fail;
3261                         }
3262
3263                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3264                                 log_error("mount(MS_MOVE) failed: %m");
3265                                 goto child_fail;
3266                         }
3267
3268                         if (chroot(".") < 0) {
3269                                 log_error("chroot() failed: %m");
3270                                 goto child_fail;
3271                         }
3272
3273                         if (chdir("/") < 0) {
3274                                 log_error("chdir() failed: %m");
3275                                 goto child_fail;
3276                         }
3277
3278                         umask(0022);
3279
3280                         if (arg_private_network)
3281                                 loopback_setup();
3282
3283                         if (drop_capabilities() < 0) {
3284                                 log_error("drop_capabilities() failed: %m");
3285                                 goto child_fail;
3286                         }
3287
3288                         r = change_uid_gid(&home);
3289                         if (r < 0)
3290                                 goto child_fail;
3291
3292                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3293                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3294                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3295                                 log_oom();
3296                                 goto child_fail;
3297                         }
3298
3299                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3300                                 char as_uuid[37];
3301
3302                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3303                                         log_oom();
3304                                         goto child_fail;
3305                                 }
3306                         }
3307
3308                         if (fdset_size(fds) > 0) {
3309                                 k = fdset_cloexec(fds, false);
3310                                 if (k < 0) {
3311                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3312                                         goto child_fail;
3313                                 }
3314
3315                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3316                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3317                                         log_oom();
3318                                         goto child_fail;
3319                                 }
3320                         }
3321
3322                         setup_hostname();
3323
3324                         if (arg_personality != 0xffffffffLU) {
3325                                 if (personality(arg_personality) < 0) {
3326                                         log_error("personality() failed: %m");
3327                                         goto child_fail;
3328                                 }
3329                         } else if (secondary) {
3330                                 if (personality(PER_LINUX32) < 0) {
3331                                         log_error("personality() failed: %m");
3332                                         goto child_fail;
3333                                 }
3334                         }
3335
3336 #ifdef HAVE_SELINUX
3337                         if (arg_selinux_context)
3338                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3339                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3340                                         goto child_fail;
3341                                 }
3342 #endif
3343
3344                         if (!strv_isempty(arg_setenv)) {
3345                                 char **n;
3346
3347                                 n = strv_env_merge(2, envp, arg_setenv);
3348                                 if (!n) {
3349                                         log_oom();
3350                                         goto child_fail;
3351                                 }
3352
3353                                 env_use = n;
3354                         } else
3355                                 env_use = (char**) envp;
3356
3357                         /* Wait until the parent is ready with the setup, too... */
3358                         r = eventfd_parent_succeeded(eventfds[0]);
3359                         eventfds[0] = safe_close(eventfds[0]);
3360                         if (r < 0)
3361                                 goto child_fail;
3362
3363                         if (arg_boot) {
3364                                 char **a;
3365                                 size_t l;
3366
3367                                 /* Automatically search for the init system */
3368
3369                                 l = 1 + argc - optind;
3370                                 a = newa(char*, l + 1);
3371                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3372
3373                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3374                                 execve(a[0], a, env_use);
3375
3376                                 a[0] = (char*) "/lib/systemd/systemd";
3377                                 execve(a[0], a, env_use);
3378
3379                                 a[0] = (char*) "/sbin/init";
3380                                 execve(a[0], a, env_use);
3381                         } else if (argc > optind)
3382                                 execvpe(argv[optind], argv + optind, env_use);
3383                         else {
3384                                 chdir(home ? home : "/root");
3385                                 execle("/bin/bash", "-bash", NULL, env_use);
3386                                 execle("/bin/sh", "-sh", NULL, env_use);
3387                         }
3388
3389                         log_error("execv() failed: %m");
3390
3391                 child_fail:
3392                         /* Tell the parent that the setup failed, so he
3393                          * can clean up resources and terminate. */
3394                         if (eventfds[1] != -1)
3395                                 eventfd_send_state(eventfds[1],
3396                                                    EVENTFD_CHILD_FAILED);
3397                         _exit(EXIT_FAILURE);
3398                 }
3399
3400                 fdset_free(fds);
3401                 fds = NULL;
3402
3403                 /* Wait for the child event:
3404                  * If EVENTFD_CHILD_FAILED, the child will terminate soon.
3405                  * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
3406                  * it is ready with all it needs to do with privileges.
3407                  * After we got the notification we can make the process
3408                  * join its cgroup which might limit what it can do */
3409                 r = eventfd_child_succeeded(eventfds[1]);
3410                 eventfds[1] = safe_close(eventfds[1]);
3411
3412                 if (r >= 0) {
3413                         r = register_machine(pid);
3414                         if (r < 0)
3415                                 goto finish;
3416
3417                         r = move_network_interfaces(pid);
3418                         if (r < 0)
3419                                 goto finish;
3420
3421                         r = setup_veth(pid, veth_name);
3422                         if (r < 0)
3423                                 goto finish;
3424
3425                         r = setup_bridge(veth_name);
3426                         if (r < 0)
3427                                 goto finish;
3428
3429                         r = setup_macvlan(pid);
3430                         if (r < 0)
3431                                 goto finish;
3432
3433                         /* Block SIGCHLD here, before notifying child.
3434                          * process_pty() will handle it with the other signals. */
3435                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3436                         if (r < 0)
3437                                 goto finish;
3438
3439                         /* Reset signal to default */
3440                         r = default_signals(SIGCHLD, -1);
3441                         if (r < 0)
3442                                 goto finish;
3443
3444                         /* Notify the child that the parent is ready with all
3445                          * its setup, and that the child can now hand over
3446                          * control to the code to run inside the container. */
3447                         r = eventfd_send_state(eventfds[0], EVENTFD_PARENT_SUCCEEDED);
3448                         eventfds[0] = safe_close(eventfds[0]);
3449                         if (r < 0)
3450                                 goto finish;
3451
3452                         k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3453                         if (k < 0) {
3454                                 r = EXIT_FAILURE;
3455                                 break;
3456                         }
3457
3458                         if (!arg_quiet)
3459                                 putc('\n', stdout);
3460
3461                         /* Kill if it is not dead yet anyway */
3462                         terminate_machine(pid);
3463                 }
3464
3465                 /* Normally redundant, but better safe than sorry */
3466                 kill(pid, SIGKILL);
3467
3468                 r = wait_for_container(pid, &container_status);
3469                 pid = 0;
3470
3471                 if (r < 0) {
3472                         /* We failed to wait for the container, or the
3473                          * container exited abnormally */
3474                         r = EXIT_FAILURE;
3475                         break;
3476                 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3477                         /* The container exited with a non-zero
3478                          * status, or with zero status and no reboot
3479                          * was requested. */
3480                         break;
3481
3482                 /* CONTAINER_REBOOTED, loop again */
3483
3484                 if (arg_keep_unit) {
3485                         /* Special handling if we are running as a
3486                          * service: instead of simply restarting the
3487                          * machine we want to restart the entire
3488                          * service, so let's inform systemd about this
3489                          * with the special exit code 133. The service
3490                          * file uses RestartForceExitStatus=133 so
3491                          * that this results in a full nspawn
3492                          * restart. This is necessary since we might
3493                          * have cgroup parameters set we want to have
3494                          * flushed out. */
3495                         r = 133;
3496                         break;
3497                 }
3498         }
3499
3500 finish:
3501         loop_remove(loop_nr, &image_fd);
3502
3503         if (pid > 0)
3504                 kill(pid, SIGKILL);
3505
3506         free(arg_directory);
3507         free(arg_machine);
3508         free(arg_user);
3509         strv_free(arg_setenv);
3510         strv_free(arg_network_interfaces);
3511         strv_free(arg_network_macvlan);
3512         strv_free(arg_bind);
3513         strv_free(arg_bind_ro);
3514         strv_free(arg_tmpfs);
3515
3516         return r;
3517 }