chiark / gitweb /
nspawn: register external network interface with machined
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "eventfd-util.h"
88 #include "blkid-util.h"
89 #include "gpt.h"
90 #include "siphash24.h"
91 #include "copy.h"
92 #include "base-filesystem.h"
93
94 #ifdef HAVE_SECCOMP
95 #include "seccomp-util.h"
96 #endif
97
/* How a container run ended. NOTE(review): the consumers of this enum are
 * outside this part of the file; presumably it distinguishes a plain exit
 * from a reboot request -- confirm against the caller. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
102
/* Journal linking mode, selected with --link-journal=no|auto|host|guest
 * (and -j). The default used by this file is LINK_AUTO. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
109
/* Volatile mode, selected with --volatile[=MODE]: a boolean argument maps to
 * VOLATILE_YES/VOLATILE_NO, the literal "state" maps to VOLATILE_STATE, and
 * the option without an argument means VOLATILE_YES. */
typedef enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2
} Volatile;
115
116 static char *arg_directory = NULL;
117 static char *arg_user = NULL;
118 static sd_id128_t arg_uuid = {};
119 static char *arg_machine = NULL;
120 static const char *arg_selinux_context = NULL;
121 static const char *arg_selinux_apifs_context = NULL;
122 static const char *arg_slice = NULL;
123 static bool arg_private_network = false;
124 static bool arg_read_only = false;
125 static bool arg_boot = false;
126 static LinkJournal arg_link_journal = LINK_AUTO;
127 static uint64_t arg_retain =
128         (1ULL << CAP_CHOWN) |
129         (1ULL << CAP_DAC_OVERRIDE) |
130         (1ULL << CAP_DAC_READ_SEARCH) |
131         (1ULL << CAP_FOWNER) |
132         (1ULL << CAP_FSETID) |
133         (1ULL << CAP_IPC_OWNER) |
134         (1ULL << CAP_KILL) |
135         (1ULL << CAP_LEASE) |
136         (1ULL << CAP_LINUX_IMMUTABLE) |
137         (1ULL << CAP_NET_BIND_SERVICE) |
138         (1ULL << CAP_NET_BROADCAST) |
139         (1ULL << CAP_NET_RAW) |
140         (1ULL << CAP_SETGID) |
141         (1ULL << CAP_SETFCAP) |
142         (1ULL << CAP_SETPCAP) |
143         (1ULL << CAP_SETUID) |
144         (1ULL << CAP_SYS_ADMIN) |
145         (1ULL << CAP_SYS_CHROOT) |
146         (1ULL << CAP_SYS_NICE) |
147         (1ULL << CAP_SYS_PTRACE) |
148         (1ULL << CAP_SYS_TTY_CONFIG) |
149         (1ULL << CAP_SYS_RESOURCE) |
150         (1ULL << CAP_SYS_BOOT) |
151         (1ULL << CAP_AUDIT_WRITE) |
152         (1ULL << CAP_AUDIT_CONTROL) |
153         (1ULL << CAP_MKNOD);
154 static char **arg_bind = NULL;
155 static char **arg_bind_ro = NULL;
156 static char **arg_tmpfs = NULL;
157 static char **arg_setenv = NULL;
158 static bool arg_quiet = false;
159 static bool arg_share_system = false;
160 static bool arg_register = true;
161 static bool arg_keep_unit = false;
162 static char **arg_network_interfaces = NULL;
163 static char **arg_network_macvlan = NULL;
164 static bool arg_network_veth = false;
165 static const char *arg_network_bridge = NULL;
166 static unsigned long arg_personality = 0xffffffffLU;
167 static const char *arg_image = NULL;
168 static Volatile arg_volatile = VOLATILE_NO;
169
/* Print the usage text to stdout. Always returns 0, which parse_argv()
 * passes straight back to its caller.
 *
 * The text is kept in sync with the option table in parse_argv(): -j is
 * documented as "guest" because that is the mode the parser selects for
 * it, and --personality= is listed because the parser accepts it. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "  -i --image=PATH           File system device or image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --personality=ARCH     Pick personality for this container\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n",
               program_invocation_short_name);

        return 0;
}
223
224 static int parse_argv(int argc, char *argv[]) {
225
226         enum {
227                 ARG_VERSION = 0x100,
228                 ARG_PRIVATE_NETWORK,
229                 ARG_UUID,
230                 ARG_READ_ONLY,
231                 ARG_CAPABILITY,
232                 ARG_DROP_CAPABILITY,
233                 ARG_LINK_JOURNAL,
234                 ARG_BIND,
235                 ARG_BIND_RO,
236                 ARG_TMPFS,
237                 ARG_SETENV,
238                 ARG_SHARE_SYSTEM,
239                 ARG_REGISTER,
240                 ARG_KEEP_UNIT,
241                 ARG_NETWORK_INTERFACE,
242                 ARG_NETWORK_MACVLAN,
243                 ARG_NETWORK_VETH,
244                 ARG_NETWORK_BRIDGE,
245                 ARG_PERSONALITY,
246                 ARG_VOLATILE,
247         };
248
249         static const struct option options[] = {
250                 { "help",                  no_argument,       NULL, 'h'                   },
251                 { "version",               no_argument,       NULL, ARG_VERSION           },
252                 { "directory",             required_argument, NULL, 'D'                   },
253                 { "user",                  required_argument, NULL, 'u'                   },
254                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
255                 { "boot",                  no_argument,       NULL, 'b'                   },
256                 { "uuid",                  required_argument, NULL, ARG_UUID              },
257                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
258                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
259                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
260                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
261                 { "bind",                  required_argument, NULL, ARG_BIND              },
262                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
263                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
264                 { "machine",               required_argument, NULL, 'M'                   },
265                 { "slice",                 required_argument, NULL, 'S'                   },
266                 { "setenv",                required_argument, NULL, ARG_SETENV            },
267                 { "selinux-context",       required_argument, NULL, 'Z'                   },
268                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
269                 { "quiet",                 no_argument,       NULL, 'q'                   },
270                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
271                 { "register",              required_argument, NULL, ARG_REGISTER          },
272                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
273                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
274                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
275                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
276                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
277                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
278                 { "image",                 required_argument, NULL, 'i'                   },
279                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
280                 {}
281         };
282
283         int c, r;
284         uint64_t plus = 0, minus = 0;
285
286         assert(argc >= 0);
287         assert(argv);
288
289         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
290
291                 switch (c) {
292
293                 case 'h':
294                         return help();
295
296                 case ARG_VERSION:
297                         puts(PACKAGE_STRING);
298                         puts(SYSTEMD_FEATURES);
299                         return 0;
300
301                 case 'D':
302                         free(arg_directory);
303                         arg_directory = canonicalize_file_name(optarg);
304                         if (!arg_directory) {
305                                 log_error("Invalid root directory: %m");
306                                 return -ENOMEM;
307                         }
308
309                         break;
310
311                 case 'i':
312                         arg_image = optarg;
313                         break;
314
315                 case 'u':
316                         free(arg_user);
317                         arg_user = strdup(optarg);
318                         if (!arg_user)
319                                 return log_oom();
320
321                         break;
322
323                 case ARG_NETWORK_BRIDGE:
324                         arg_network_bridge = optarg;
325
326                         /* fall through */
327
328                 case ARG_NETWORK_VETH:
329                         arg_network_veth = true;
330                         arg_private_network = true;
331                         break;
332
333                 case ARG_NETWORK_INTERFACE:
334                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
335                                 return log_oom();
336
337                         arg_private_network = true;
338                         break;
339
340                 case ARG_NETWORK_MACVLAN:
341                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
342                                 return log_oom();
343
344                         /* fall through */
345
346                 case ARG_PRIVATE_NETWORK:
347                         arg_private_network = true;
348                         break;
349
350                 case 'b':
351                         arg_boot = true;
352                         break;
353
354                 case ARG_UUID:
355                         r = sd_id128_from_string(optarg, &arg_uuid);
356                         if (r < 0) {
357                                 log_error("Invalid UUID: %s", optarg);
358                                 return r;
359                         }
360                         break;
361
362                 case 'S':
363                         arg_slice = optarg;
364                         break;
365
366                 case 'M':
367                         if (isempty(optarg)) {
368                                 free(arg_machine);
369                                 arg_machine = NULL;
370                         } else {
371
372                                 if (!hostname_is_valid(optarg)) {
373                                         log_error("Invalid machine name: %s", optarg);
374                                         return -EINVAL;
375                                 }
376
377                                 free(arg_machine);
378                                 arg_machine = strdup(optarg);
379                                 if (!arg_machine)
380                                         return log_oom();
381
382                                 break;
383                         }
384
385                 case 'Z':
386                         arg_selinux_context = optarg;
387                         break;
388
389                 case 'L':
390                         arg_selinux_apifs_context = optarg;
391                         break;
392
393                 case ARG_READ_ONLY:
394                         arg_read_only = true;
395                         break;
396
397                 case ARG_CAPABILITY:
398                 case ARG_DROP_CAPABILITY: {
399                         char *state, *word;
400                         size_t length;
401
402                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
403                                 _cleanup_free_ char *t;
404                                 cap_value_t cap;
405
406                                 t = strndup(word, length);
407                                 if (!t)
408                                         return log_oom();
409
410                                 if (streq(t, "all")) {
411                                         if (c == ARG_CAPABILITY)
412                                                 plus = (uint64_t) -1;
413                                         else
414                                                 minus = (uint64_t) -1;
415                                 } else {
416                                         if (cap_from_name(t, &cap) < 0) {
417                                                 log_error("Failed to parse capability %s.", t);
418                                                 return -EINVAL;
419                                         }
420
421                                         if (c == ARG_CAPABILITY)
422                                                 plus |= 1ULL << (uint64_t) cap;
423                                         else
424                                                 minus |= 1ULL << (uint64_t) cap;
425                                 }
426                         }
427
428                         break;
429                 }
430
431                 case 'j':
432                         arg_link_journal = LINK_GUEST;
433                         break;
434
435                 case ARG_LINK_JOURNAL:
436                         if (streq(optarg, "auto"))
437                                 arg_link_journal = LINK_AUTO;
438                         else if (streq(optarg, "no"))
439                                 arg_link_journal = LINK_NO;
440                         else if (streq(optarg, "guest"))
441                                 arg_link_journal = LINK_GUEST;
442                         else if (streq(optarg, "host"))
443                                 arg_link_journal = LINK_HOST;
444                         else {
445                                 log_error("Failed to parse link journal mode %s", optarg);
446                                 return -EINVAL;
447                         }
448
449                         break;
450
451                 case ARG_BIND:
452                 case ARG_BIND_RO: {
453                         _cleanup_free_ char *a = NULL, *b = NULL;
454                         char *e;
455                         char ***x;
456
457                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
458
459                         e = strchr(optarg, ':');
460                         if (e) {
461                                 a = strndup(optarg, e - optarg);
462                                 b = strdup(e + 1);
463                         } else {
464                                 a = strdup(optarg);
465                                 b = strdup(optarg);
466                         }
467
468                         if (!a || !b)
469                                 return log_oom();
470
471                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
472                                 log_error("Invalid bind mount specification: %s", optarg);
473                                 return -EINVAL;
474                         }
475
476                         r = strv_extend(x, a);
477                         if (r < 0)
478                                 return log_oom();
479
480                         r = strv_extend(x, b);
481                         if (r < 0)
482                                 return log_oom();
483
484                         break;
485                 }
486
487                 case ARG_TMPFS: {
488                         _cleanup_free_ char *a = NULL, *b = NULL;
489                         char *e;
490
491                         e = strchr(optarg, ':');
492                         if (e) {
493                                 a = strndup(optarg, e - optarg);
494                                 b = strdup(e + 1);
495                         } else {
496                                 a = strdup(optarg);
497                                 b = strdup("mode=0755");
498                         }
499
500                         if (!a || !b)
501                                 return log_oom();
502
503                         if (!path_is_absolute(a)) {
504                                 log_error("Invalid tmpfs specification: %s", optarg);
505                                 return -EINVAL;
506                         }
507
508                         r = strv_push(&arg_tmpfs, a);
509                         if (r < 0)
510                                 return log_oom();
511
512                         a = NULL;
513
514                         r = strv_push(&arg_tmpfs, b);
515                         if (r < 0)
516                                 return log_oom();
517
518                         b = NULL;
519
520                         break;
521                 }
522
523                 case ARG_SETENV: {
524                         char **n;
525
526                         if (!env_assignment_is_valid(optarg)) {
527                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
528                                 return -EINVAL;
529                         }
530
531                         n = strv_env_set(arg_setenv, optarg);
532                         if (!n)
533                                 return log_oom();
534
535                         strv_free(arg_setenv);
536                         arg_setenv = n;
537                         break;
538                 }
539
540                 case 'q':
541                         arg_quiet = true;
542                         break;
543
544                 case ARG_SHARE_SYSTEM:
545                         arg_share_system = true;
546                         break;
547
548                 case ARG_REGISTER:
549                         r = parse_boolean(optarg);
550                         if (r < 0) {
551                                 log_error("Failed to parse --register= argument: %s", optarg);
552                                 return r;
553                         }
554
555                         arg_register = r;
556                         break;
557
558                 case ARG_KEEP_UNIT:
559                         arg_keep_unit = true;
560                         break;
561
562                 case ARG_PERSONALITY:
563
564                         arg_personality = personality_from_string(optarg);
565                         if (arg_personality == 0xffffffffLU) {
566                                 log_error("Unknown or unsupported personality '%s'.", optarg);
567                                 return -EINVAL;
568                         }
569
570                         break;
571
572                 case ARG_VOLATILE:
573
574                         if (!optarg)
575                                 arg_volatile = VOLATILE_YES;
576                         else {
577                                 r = parse_boolean(optarg);
578                                 if (r < 0) {
579                                         if (streq(optarg, "state"))
580                                                 arg_volatile = VOLATILE_STATE;
581                                         else {
582                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
583                                                 return r;
584                                         }
585                                 } else
586                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
587                         }
588
589                         break;
590
591                 case '?':
592                         return -EINVAL;
593
594                 default:
595                         assert_not_reached("Unhandled option");
596                 }
597         }
598
599         if (arg_share_system)
600                 arg_register = false;
601
602         if (arg_boot && arg_share_system) {
603                 log_error("--boot and --share-system may not be combined.");
604                 return -EINVAL;
605         }
606
607         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
608                 log_error("--keep-unit may not be used when invoked from a user session.");
609                 return -EINVAL;
610         }
611
612         if (arg_directory && arg_image) {
613                 log_error("--directory= and --image= may not be combined.");
614                 return -EINVAL;
615         }
616
617         if (arg_volatile != VOLATILE_NO && arg_read_only) {
618                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
619                 return -EINVAL;
620         }
621
622         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
623
624         return 1;
625 }
626
627 static int mount_all(const char *dest) {
628
629         typedef struct MountPoint {
630                 const char *what;
631                 const char *where;
632                 const char *type;
633                 const char *options;
634                 unsigned long flags;
635                 bool fatal;
636         } MountPoint;
637
638         static const MountPoint mount_table[] = {
639                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
640                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
641                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
642                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
643                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
644                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
645                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
646                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
647 #ifdef HAVE_SELINUX
648                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
649                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
650 #endif
651         };
652
653         unsigned k;
654         int r = 0;
655
656         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
657                 _cleanup_free_ char *where = NULL;
658 #ifdef HAVE_SELINUX
659                 _cleanup_free_ char *options = NULL;
660 #endif
661                 const char *o;
662                 int t;
663
664                 where = strjoin(dest, "/", mount_table[k].where, NULL);
665                 if (!where)
666                         return log_oom();
667
668                 t = path_is_mount_point(where, true);
669                 if (t < 0) {
670                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
671
672                         if (r == 0)
673                                 r = t;
674
675                         continue;
676                 }
677
678                 /* Skip this entry if it is not a remount. */
679                 if (mount_table[k].what && t > 0)
680                         continue;
681
682                 mkdir_p(where, 0755);
683
684 #ifdef HAVE_SELINUX
685                 if (arg_selinux_apifs_context &&
686                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
687                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
688                         if (!options)
689                                 return log_oom();
690
691                         o = options;
692                 } else
693 #endif
694                         o = mount_table[k].options;
695
696
697                 if (mount(mount_table[k].what,
698                           where,
699                           mount_table[k].type,
700                           mount_table[k].flags,
701                           o) < 0 &&
702                     mount_table[k].fatal) {
703
704                         log_error("mount(%s) failed: %m", where);
705
706                         if (r == 0)
707                                 r = -errno;
708                 }
709         }
710
711         return r;
712 }
713
714 static int mount_binds(const char *dest, char **l, bool ro) {
715         char **x, **y;
716
717         STRV_FOREACH_PAIR(x, y, l) {
718                 _cleanup_free_ char *where = NULL;
719                 struct stat source_st, dest_st;
720                 int r;
721
722                 if (stat(*x, &source_st) < 0) {
723                         log_error("Failed to stat %s: %m", *x);
724                         return -errno;
725                 }
726
727                 where = strappend(dest, *y);
728                 if (!where)
729                         return log_oom();
730
731                 r = stat(where, &dest_st);
732                 if (r == 0) {
733                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
734                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
735                                 return -EINVAL;
736                         }
737                 } else if (errno == ENOENT) {
738                         r = mkdir_parents_label(where, 0755);
739                         if (r < 0) {
740                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
741                                 return r;
742                         }
743                 } else {
744                         log_error("Failed to bind mount %s: %m", *x);
745                         return -errno;
746                 }
747
748                 /* Create the mount point, but be conservative -- refuse to create block
749                  * and char devices. */
750                 if (S_ISDIR(source_st.st_mode))
751                         mkdir_label(where, 0755);
752                 else if (S_ISFIFO(source_st.st_mode))
753                         mkfifo(where, 0644);
754                 else if (S_ISSOCK(source_st.st_mode))
755                         mknod(where, 0644 | S_IFSOCK, 0);
756                 else if (S_ISREG(source_st.st_mode))
757                         touch(where);
758                 else {
759                         log_error("Refusing to create mountpoint for file: %s", *x);
760                         return -ENOTSUP;
761                 }
762
763                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
764                         log_error("mount(%s) failed: %m", where);
765                         return -errno;
766                 }
767
768                 if (ro) {
769                         r = bind_remount_recursive(where, true);
770                         if (r < 0) {
771                                 log_error("Read-Only bind mount failed: %s", strerror(-r));
772                                 return r;
773                         }
774                 }
775         }
776
777         return 0;
778 }
779
780 static int mount_tmpfs(const char *dest) {
781         char **i, **o;
782
783         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
784                 _cleanup_free_ char *where = NULL;
785
786                 where = strappend(dest, *i);
787                 if (!where)
788                         return log_oom();
789
790                 mkdir_label(where, 0755);
791
792                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
793                         log_error("tmpfs mount to %s failed: %m", where);
794                         return -errno;
795                 }
796         }
797
798         return 0;
799 }
800
801 static int setup_timezone(const char *dest) {
802         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
803         char *z, *y;
804         int r;
805
806         assert(dest);
807
808         /* Fix the timezone, if possible */
809         r = readlink_malloc("/etc/localtime", &p);
810         if (r < 0) {
811                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
812                 return 0;
813         }
814
815         z = path_startswith(p, "../usr/share/zoneinfo/");
816         if (!z)
817                 z = path_startswith(p, "/usr/share/zoneinfo/");
818         if (!z) {
819                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
820                 return 0;
821         }
822
823         where = strappend(dest, "/etc/localtime");
824         if (!where)
825                 return log_oom();
826
827         r = readlink_malloc(where, &q);
828         if (r >= 0) {
829                 y = path_startswith(q, "../usr/share/zoneinfo/");
830                 if (!y)
831                         y = path_startswith(q, "/usr/share/zoneinfo/");
832
833                 /* Already pointing to the right place? Then do nothing .. */
834                 if (y && streq(y, z))
835                         return 0;
836         }
837
838         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
839         if (!check)
840                 return log_oom();
841
842         if (access(check, F_OK) < 0) {
843                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
844                 return 0;
845         }
846
847         what = strappend("../usr/share/zoneinfo/", z);
848         if (!what)
849                 return log_oom();
850
851         mkdir_parents(where, 0755);
852         unlink(where);
853
854         if (symlink(what, where) < 0) {
855                 log_error("Failed to correct timezone of container: %m");
856                 return 0;
857         }
858
859         return 0;
860 }
861
862 static int setup_resolv_conf(const char *dest) {
863         _cleanup_free_ char *where = NULL;
864
865         assert(dest);
866
867         if (arg_private_network)
868                 return 0;
869
870         /* Fix resolv.conf, if possible */
871         where = strappend(dest, "/etc/resolv.conf");
872         if (!where)
873                 return log_oom();
874
875         /* We don't really care for the results of this really. If it
876          * fails, it fails, but meh... */
877         mkdir_parents(where, 0755);
878         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
879
880         return 0;
881 }
882
883 static int setup_volatile_state(const char *directory) {
884         const char *p;
885         int r;
886
887         assert(directory);
888
889         if (arg_volatile != VOLATILE_STATE)
890                 return 0;
891
892         /* --volatile=state means we simply overmount /var
893            with a tmpfs, and the rest read-only. */
894
895         r = bind_remount_recursive(directory, true);
896         if (r < 0) {
897                 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
898                 return r;
899         }
900
901         p = strappenda(directory, "/var");
902         mkdir(p, 0755);
903
904         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
905                 log_error("Failed to mount tmpfs to /var: %m");
906                 return -errno;
907         }
908
909         return 0;
910 }
911
912 static int setup_volatile(const char *directory) {
913         bool tmpfs_mounted = false, bind_mounted = false;
914         char template[] = "/tmp/nspawn-volatile-XXXXXX";
915         const char *f, *t;
916         int r;
917
918         assert(directory);
919
920         if (arg_volatile != VOLATILE_YES)
921                 return 0;
922
923         /* --volatile=yes means we mount a tmpfs to the root dir, and
924            the original /usr to use inside it, and that read-only. */
925
926         if (!mkdtemp(template)) {
927                 log_error("Failed to create temporary directory: %m");
928                 return -errno;
929         }
930
931         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
932                 log_error("Failed to mount tmpfs for root directory: %m");
933                 r = -errno;
934                 goto fail;
935         }
936
937         tmpfs_mounted = true;
938
939         f = strappenda(directory, "/usr");
940         t = strappenda(template, "/usr");
941
942         mkdir(t, 0755);
943         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
944                 log_error("Failed to create /usr bind mount: %m");
945                 r = -errno;
946                 goto fail;
947         }
948
949         bind_mounted = true;
950
951         r = bind_remount_recursive(t, true);
952         if (r < 0) {
953                 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
954                 goto fail;
955         }
956
957         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
958                 log_error("Failed to move root mount: %m");
959                 r = -errno;
960                 goto fail;
961         }
962
963         rmdir(template);
964
965         return 0;
966
967 fail:
968         if (bind_mounted)
969                 umount(t);
970         if (tmpfs_mounted)
971                 umount(template);
972         rmdir(template);
973         return r;
974 }
975
976 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
977
978         snprintf(s, 37,
979                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
980                  SD_ID128_FORMAT_VAL(id));
981
982         return s;
983 }
984
985 static int setup_boot_id(const char *dest) {
986         _cleanup_free_ char *from = NULL, *to = NULL;
987         sd_id128_t rnd = {};
988         char as_uuid[37];
989         int r;
990
991         assert(dest);
992
993         if (arg_share_system)
994                 return 0;
995
996         /* Generate a new randomized boot ID, so that each boot-up of
997          * the container gets a new one */
998
999         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1000         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1001         if (!from || !to)
1002                 return log_oom();
1003
1004         r = sd_id128_randomize(&rnd);
1005         if (r < 0) {
1006                 log_error("Failed to generate random boot id: %s", strerror(-r));
1007                 return r;
1008         }
1009
1010         id128_format_as_uuid(rnd, as_uuid);
1011
1012         r = write_string_file(from, as_uuid);
1013         if (r < 0) {
1014                 log_error("Failed to write boot id: %s", strerror(-r));
1015                 return r;
1016         }
1017
1018         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1019                 log_error("Failed to bind mount boot id: %m");
1020                 r = -errno;
1021         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1022                 log_warning("Failed to make boot id read-only: %m");
1023
1024         unlink(from);
1025         return r;
1026 }
1027
1028 static int copy_devnodes(const char *dest) {
1029
1030         static const char devnodes[] =
1031                 "null\0"
1032                 "zero\0"
1033                 "full\0"
1034                 "random\0"
1035                 "urandom\0"
1036                 "tty\0";
1037
1038         const char *d;
1039         int r = 0;
1040         _cleanup_umask_ mode_t u;
1041
1042         assert(dest);
1043
1044         u = umask(0000);
1045
1046         NULSTR_FOREACH(d, devnodes) {
1047                 _cleanup_free_ char *from = NULL, *to = NULL;
1048                 struct stat st;
1049
1050                 from = strappend("/dev/", d);
1051                 to = strjoin(dest, "/dev/", d, NULL);
1052                 if (!from || !to)
1053                         return log_oom();
1054
1055                 if (stat(from, &st) < 0) {
1056
1057                         if (errno != ENOENT) {
1058                                 log_error("Failed to stat %s: %m", from);
1059                                 return -errno;
1060                         }
1061
1062                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1063
1064                         log_error("%s is not a char or block device, cannot copy", from);
1065                         return -EIO;
1066
1067                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1068
1069                         log_error("mknod(%s) failed: %m", dest);
1070                         return  -errno;
1071                 }
1072         }
1073
1074         return r;
1075 }
1076
1077 static int setup_ptmx(const char *dest) {
1078         _cleanup_free_ char *p = NULL;
1079
1080         p = strappend(dest, "/dev/ptmx");
1081         if (!p)
1082                 return log_oom();
1083
1084         if (symlink("pts/ptmx", p) < 0) {
1085                 log_error("Failed to create /dev/ptmx symlink: %m");
1086                 return -errno;
1087         }
1088
1089         return 0;
1090 }
1091
1092 static int setup_dev_console(const char *dest, const char *console) {
1093         _cleanup_umask_ mode_t u;
1094         const char *to;
1095         struct stat st;
1096         int r;
1097
1098         assert(dest);
1099         assert(console);
1100
1101         u = umask(0000);
1102
1103         if (stat("/dev/null", &st) < 0) {
1104                 log_error("Failed to stat /dev/null: %m");
1105                 return -errno;
1106         }
1107
1108         r = chmod_and_chown(console, 0600, 0, 0);
1109         if (r < 0) {
1110                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
1111                 return r;
1112         }
1113
1114         /* We need to bind mount the right tty to /dev/console since
1115          * ptys can only exist on pts file systems. To have something
1116          * to bind mount things on we create a device node first, and
1117          * use /dev/null for that since we the cgroups device policy
1118          * allows us to create that freely, while we cannot create
1119          * /dev/console. (Note that the major minor doesn't actually
1120          * matter here, since we mount it over anyway). */
1121
1122         to = strappenda(dest, "/dev/console");
1123         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1124                 log_error("mknod() for /dev/console failed: %m");
1125                 return -errno;
1126         }
1127
1128         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1129                 log_error("Bind mount for /dev/console failed: %m");
1130                 return -errno;
1131         }
1132
1133         return 0;
1134 }
1135
1136 static int setup_kmsg(const char *dest, int kmsg_socket) {
1137         _cleanup_free_ char *from = NULL, *to = NULL;
1138         int r, fd, k;
1139         _cleanup_umask_ mode_t u;
1140         union {
1141                 struct cmsghdr cmsghdr;
1142                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1143         } control = {};
1144         struct msghdr mh = {
1145                 .msg_control = &control,
1146                 .msg_controllen = sizeof(control),
1147         };
1148         struct cmsghdr *cmsg;
1149
1150         assert(dest);
1151         assert(kmsg_socket >= 0);
1152
1153         u = umask(0000);
1154
1155         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1156          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1157          * on the reading side behave very similar to /proc/kmsg,
1158          * their writing side behaves differently from /dev/kmsg in
1159          * that writing blocks when nothing is reading. In order to
1160          * avoid any problems with containers deadlocking due to this
1161          * we simply make /dev/kmsg unavailable to the container. */
1162         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1163             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1164                 return log_oom();
1165
1166         if (mkfifo(from, 0600) < 0) {
1167                 log_error("mkfifo() for /dev/kmsg failed: %m");
1168                 return -errno;
1169         }
1170
1171         r = chmod_and_chown(from, 0600, 0, 0);
1172         if (r < 0) {
1173                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1174                 return r;
1175         }
1176
1177         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1178                 log_error("Bind mount for /proc/kmsg failed: %m");
1179                 return -errno;
1180         }
1181
1182         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1183         if (fd < 0) {
1184                 log_error("Failed to open fifo: %m");
1185                 return -errno;
1186         }
1187
1188         cmsg = CMSG_FIRSTHDR(&mh);
1189         cmsg->cmsg_level = SOL_SOCKET;
1190         cmsg->cmsg_type = SCM_RIGHTS;
1191         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1192         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1193
1194         mh.msg_controllen = cmsg->cmsg_len;
1195
1196         /* Store away the fd in the socket, so that it stays open as
1197          * long as we run the child */
1198         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1199         safe_close(fd);
1200
1201         if (k < 0) {
1202                 log_error("Failed to send FIFO fd: %m");
1203                 return -errno;
1204         }
1205
1206         /* And now make the FIFO unavailable as /dev/kmsg... */
1207         unlink(from);
1208         return 0;
1209 }
1210
1211 static int setup_hostname(void) {
1212
1213         if (arg_share_system)
1214                 return 0;
1215
1216         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1217                 return -errno;
1218
1219         return 0;
1220 }
1221
1222 static int setup_journal(const char *directory) {
1223         sd_id128_t machine_id, this_id;
1224         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1225         char *id;
1226         int r;
1227
1228         p = strappend(directory, "/etc/machine-id");
1229         if (!p)
1230                 return log_oom();
1231
1232         r = read_one_line_file(p, &b);
1233         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1234                 return 0;
1235         else if (r < 0) {
1236                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1237                 return r;
1238         }
1239
1240         id = strstrip(b);
1241         if (isempty(id) && arg_link_journal == LINK_AUTO)
1242                 return 0;
1243
1244         /* Verify validity */
1245         r = sd_id128_from_string(id, &machine_id);
1246         if (r < 0) {
1247                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1248                 return r;
1249         }
1250
1251         r = sd_id128_get_machine(&this_id);
1252         if (r < 0) {
1253                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1254                 return r;
1255         }
1256
1257         if (sd_id128_equal(machine_id, this_id)) {
1258                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1259                          "Host and machine ids are equal (%s): refusing to link journals", id);
1260                 if (arg_link_journal == LINK_AUTO)
1261                         return 0;
1262                 return
1263                         -EEXIST;
1264         }
1265
1266         if (arg_link_journal == LINK_NO)
1267                 return 0;
1268
1269         free(p);
1270         p = strappend("/var/log/journal/", id);
1271         q = strjoin(directory, "/var/log/journal/", id, NULL);
1272         if (!p || !q)
1273                 return log_oom();
1274
1275         if (path_is_mount_point(p, false) > 0) {
1276                 if (arg_link_journal != LINK_AUTO) {
1277                         log_error("%s: already a mount point, refusing to use for journal", p);
1278                         return -EEXIST;
1279                 }
1280
1281                 return 0;
1282         }
1283
1284         if (path_is_mount_point(q, false) > 0) {
1285                 if (arg_link_journal != LINK_AUTO) {
1286                         log_error("%s: already a mount point, refusing to use for journal", q);
1287                         return -EEXIST;
1288                 }
1289
1290                 return 0;
1291         }
1292
1293         r = readlink_and_make_absolute(p, &d);
1294         if (r >= 0) {
1295                 if ((arg_link_journal == LINK_GUEST ||
1296                      arg_link_journal == LINK_AUTO) &&
1297                     path_equal(d, q)) {
1298
1299                         r = mkdir_p(q, 0755);
1300                         if (r < 0)
1301                                 log_warning("failed to create directory %s: %m", q);
1302                         return 0;
1303                 }
1304
1305                 if (unlink(p) < 0) {
1306                         log_error("Failed to remove symlink %s: %m", p);
1307                         return -errno;
1308                 }
1309         } else if (r == -EINVAL) {
1310
1311                 if (arg_link_journal == LINK_GUEST &&
1312                     rmdir(p) < 0) {
1313
1314                         if (errno == ENOTDIR) {
1315                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1316                                 return r;
1317                         } else {
1318                                 log_error("Failed to remove %s: %m", p);
1319                                 return -errno;
1320                         }
1321                 }
1322         } else if (r != -ENOENT) {
1323                 log_error("readlink(%s) failed: %m", p);
1324                 return r;
1325         }
1326
1327         if (arg_link_journal == LINK_GUEST) {
1328
1329                 if (symlink(q, p) < 0) {
1330                         log_error("Failed to symlink %s to %s: %m", q, p);
1331                         return -errno;
1332                 }
1333
1334                 r = mkdir_p(q, 0755);
1335                 if (r < 0)
1336                         log_warning("failed to create directory %s: %m", q);
1337                 return 0;
1338         }
1339
1340         if (arg_link_journal == LINK_HOST) {
1341                 r = mkdir_p(p, 0755);
1342                 if (r < 0) {
1343                         log_error("Failed to create %s: %m", p);
1344                         return r;
1345                 }
1346
1347         } else if (access(p, F_OK) < 0)
1348                 return 0;
1349
1350         if (dir_is_empty(q) == 0)
1351                 log_warning("%s is not empty, proceeding anyway.", q);
1352
1353         r = mkdir_p(q, 0755);
1354         if (r < 0) {
1355                 log_error("Failed to create %s: %m", q);
1356                 return r;
1357         }
1358
1359         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1360                 log_error("Failed to bind mount journal from host into guest: %m");
1361                 return -errno;
1362         }
1363
1364         return 0;
1365 }
1366
1367 static int setup_kdbus(const char *dest, const char *path) {
1368         const char *p;
1369
1370         if (!path)
1371                 return 0;
1372
1373         p = strappenda(dest, "/dev/kdbus");
1374         if (mkdir(p, 0755) < 0) {
1375                 log_error("Failed to create kdbus path: %m");
1376                 return  -errno;
1377         }
1378
1379         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1380                 log_error("Failed to mount kdbus domain path: %m");
1381                 return -errno;
1382         }
1383
1384         return 0;
1385 }
1386
1387 static int drop_capabilities(void) {
1388         return capability_bounding_set_drop(~arg_retain, false);
1389 }
1390
1391 static int register_machine(pid_t pid, int local_ifindex) {
1392         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1393         _cleanup_bus_unref_ sd_bus *bus = NULL;
1394         int r;
1395
1396         if (!arg_register)
1397                 return 0;
1398
1399         r = sd_bus_default_system(&bus);
1400         if (r < 0) {
1401                 log_error("Failed to open system bus: %s", strerror(-r));
1402                 return r;
1403         }
1404
1405         if (arg_keep_unit) {
1406                 r = sd_bus_call_method(
1407                                 bus,
1408                                 "org.freedesktop.machine1",
1409                                 "/org/freedesktop/machine1",
1410                                 "org.freedesktop.machine1.Manager",
1411                                 "RegisterMachineWithNetwork",
1412                                 &error,
1413                                 NULL,
1414                                 "sayssusai",
1415                                 arg_machine,
1416                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1417                                 "nspawn",
1418                                 "container",
1419                                 (uint32_t) pid,
1420                                 strempty(arg_directory),
1421                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1422         } else {
1423                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1424
1425                 r = sd_bus_message_new_method_call(
1426                                 bus,
1427                                 &m,
1428                                 "org.freedesktop.machine1",
1429                                 "/org/freedesktop/machine1",
1430                                 "org.freedesktop.machine1.Manager",
1431                                 "CreateMachineWithNetwork");
1432                 if (r < 0) {
1433                         log_error("Failed to create message: %s", strerror(-r));
1434                         return r;
1435                 }
1436
1437                 r = sd_bus_message_append(
1438                                 m,
1439                                 "sayssusai",
1440                                 arg_machine,
1441                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1442                                 "nspawn",
1443                                 "container",
1444                                 (uint32_t) pid,
1445                                 strempty(arg_directory),
1446                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1447                 if (r < 0) {
1448                         log_error("Failed to append message arguments: %s", strerror(-r));
1449                         return r;
1450                 }
1451
1452                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1453                 if (r < 0) {
1454                         log_error("Failed to open container: %s", strerror(-r));
1455                         return r;
1456                 }
1457
1458                 if (!isempty(arg_slice)) {
1459                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1460                         if (r < 0) {
1461                                 log_error("Failed to append slice: %s", strerror(-r));
1462                                 return r;
1463                         }
1464                 }
1465
1466                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1467                 if (r < 0) {
1468                         log_error("Failed to add device policy: %s", strerror(-r));
1469                         return r;
1470                 }
1471
1472                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1473                                           /* Allow the container to
1474                                            * access and create the API
1475                                            * device nodes, so that
1476                                            * PrivateDevices= in the
1477                                            * container can work
1478                                            * fine */
1479                                           "/dev/null", "rwm",
1480                                           "/dev/zero", "rwm",
1481                                           "/dev/full", "rwm",
1482                                           "/dev/random", "rwm",
1483                                           "/dev/urandom", "rwm",
1484                                           "/dev/tty", "rwm",
1485                                           /* Allow the container
1486                                            * access to ptys. However,
1487                                            * do not permit the
1488                                            * container to ever create
1489                                            * these device nodes. */
1490                                           "/dev/pts/ptmx", "rw",
1491                                           "char-pts", "rw",
1492                                           /* Allow the container
1493                                            * access to all kdbus
1494                                            * devices. Again, the
1495                                            * container cannot create
1496                                            * these nodes, only use
1497                                            * them. We use a pretty
1498                                            * open match here, so that
1499                                            * the kernel API can still
1500                                            * change. */
1501                                           "char-kdbus", "rw",
1502                                           "char-kdbus/*", "rw");
1503                 if (r < 0) {
1504                         log_error("Failed to add device whitelist: %s", strerror(-r));
1505                         return r;
1506                 }
1507
1508                 r = sd_bus_message_close_container(m);
1509                 if (r < 0) {
1510                         log_error("Failed to close container: %s", strerror(-r));
1511                         return r;
1512                 }
1513
1514                 r = sd_bus_call(bus, m, 0, &error, NULL);
1515         }
1516
1517         if (r < 0) {
1518                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1519                 return r;
1520         }
1521
1522         return 0;
1523 }
1524
1525 static int terminate_machine(pid_t pid) {
1526         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1527         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1528         _cleanup_bus_unref_ sd_bus *bus = NULL;
1529         const char *path;
1530         int r;
1531
1532         if (!arg_register)
1533                 return 0;
1534
1535         r = sd_bus_default_system(&bus);
1536         if (r < 0) {
1537                 log_error("Failed to open system bus: %s", strerror(-r));
1538                 return r;
1539         }
1540
1541         r = sd_bus_call_method(
1542                         bus,
1543                         "org.freedesktop.machine1",
1544                         "/org/freedesktop/machine1",
1545                         "org.freedesktop.machine1.Manager",
1546                         "GetMachineByPID",
1547                         &error,
1548                         &reply,
1549                         "u",
1550                         (uint32_t) pid);
1551         if (r < 0) {
1552                 /* Note that the machine might already have been
1553                  * cleaned up automatically, hence don't consider it a
1554                  * failure if we cannot get the machine object. */
1555                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1556                 return 0;
1557         }
1558
1559         r = sd_bus_message_read(reply, "o", &path);
1560         if (r < 0)
1561                 return bus_log_parse_error(r);
1562
1563         r = sd_bus_call_method(
1564                         bus,
1565                         "org.freedesktop.machine1",
1566                         path,
1567                         "org.freedesktop.machine1.Machine",
1568                         "Terminate",
1569                         &error,
1570                         NULL,
1571                         NULL);
1572         if (r < 0) {
1573                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1574                 return 0;
1575         }
1576
1577         return 0;
1578 }
1579
1580 static int reset_audit_loginuid(void) {
1581         _cleanup_free_ char *p = NULL;
1582         int r;
1583
1584         if (arg_share_system)
1585                 return 0;
1586
1587         r = read_one_line_file("/proc/self/loginuid", &p);
1588         if (r == -ENOENT)
1589                 return 0;
1590         if (r < 0) {
1591                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1592                 return r;
1593         }
1594
1595         /* Already reset? */
1596         if (streq(p, "4294967295"))
1597                 return 0;
1598
1599         r = write_string_file("/proc/self/loginuid", "4294967295");
1600         if (r < 0) {
1601                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1602                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1603                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1604                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1605                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1606
1607                 sleep(5);
1608         }
1609
1610         return 0;
1611 }
1612
1613 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1614
1615 static int get_mac(struct ether_addr *mac) {
1616         int r;
1617
1618         uint8_t result[8];
1619         size_t l, sz;
1620         uint8_t *v;
1621
1622         l = strlen(arg_machine);
1623         sz = sizeof(sd_id128_t) + l;
1624         v = alloca(sz);
1625
1626         /* fetch some persistent data unique to the host */
1627         r = sd_id128_get_machine((sd_id128_t*) v);
1628         if (r < 0)
1629                 return r;
1630
1631         /* combine with some data unique (on this host) to this
1632          * container instance */
1633         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1634
1635         /* Let's hash the host machine ID plus the container name. We
1636          * use a fixed, but originally randomly created hash key here. */
1637         siphash24(result, v, sz, HASH_KEY.bytes);
1638
1639         assert_cc(ETH_ALEN <= sizeof(result));
1640         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1641
1642         /* see eth_random_addr in the kernel */
1643         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1644         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1645
1646         return 0;
1647 }
1648
1649 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1650         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1651         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1652         struct ether_addr mac;
1653         int r, i;
1654
1655         if (!arg_private_network)
1656                 return 0;
1657
1658         if (!arg_network_veth)
1659                 return 0;
1660
1661         /* Use two different interface name prefixes depending whether
1662          * we are in bridge mode or not. */
1663         if (arg_network_bridge)
1664                 memcpy(iface_name, "vb-", 3);
1665         else
1666                 memcpy(iface_name, "ve-", 3);
1667         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1668
1669         r = get_mac(&mac);
1670         if (r < 0) {
1671                 log_error("Failed to generate predictable MAC address for host0");
1672                 return r;
1673         }
1674
1675         r = sd_rtnl_open(&rtnl, 0);
1676         if (r < 0) {
1677                 log_error("Failed to connect to netlink: %s", strerror(-r));
1678                 return r;
1679         }
1680
1681         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1682         if (r < 0) {
1683                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1684                 return r;
1685         }
1686
1687         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1688         if (r < 0) {
1689                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1690                 return r;
1691         }
1692
1693         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1694         if (r < 0) {
1695                 log_error("Failed to open netlink container: %s", strerror(-r));
1696                 return r;
1697         }
1698
1699         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1700         if (r < 0) {
1701                 log_error("Failed to open netlink container: %s", strerror(-r));
1702                 return r;
1703         }
1704
1705         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1706         if (r < 0) {
1707                 log_error("Failed to open netlink container: %s", strerror(-r));
1708                 return r;
1709         }
1710
1711         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1712         if (r < 0) {
1713                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1714                 return r;
1715         }
1716
1717         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1718         if (r < 0) {
1719                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1720                 return r;
1721         }
1722
1723         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1724         if (r < 0) {
1725                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1726                 return r;
1727         }
1728
1729         r = sd_rtnl_message_close_container(m);
1730         if (r < 0) {
1731                 log_error("Failed to close netlink container: %s", strerror(-r));
1732                 return r;
1733         }
1734
1735         r = sd_rtnl_message_close_container(m);
1736         if (r < 0) {
1737                 log_error("Failed to close netlink container: %s", strerror(-r));
1738                 return r;
1739         }
1740
1741         r = sd_rtnl_message_close_container(m);
1742         if (r < 0) {
1743                 log_error("Failed to close netlink container: %s", strerror(-r));
1744                 return r;
1745         }
1746
1747         r = sd_rtnl_call(rtnl, m, 0, NULL);
1748         if (r < 0) {
1749                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1750                 return r;
1751         }
1752
1753         i = (int) if_nametoindex(iface_name);
1754         if (i <= 0) {
1755                 log_error("Failed to resolve interface %s: %m", iface_name);
1756                 return -errno;
1757         }
1758
1759         *ifi = i;
1760
1761         return 0;
1762 }
1763
1764 static int setup_bridge(const char veth_name[], int *ifi) {
1765         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1766         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1767         int r, bridge;
1768
1769         if (!arg_private_network)
1770                 return 0;
1771
1772         if (!arg_network_veth)
1773                 return 0;
1774
1775         if (!arg_network_bridge)
1776                 return 0;
1777
1778         bridge = (int) if_nametoindex(arg_network_bridge);
1779         if (bridge <= 0) {
1780                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1781                 return -errno;
1782         }
1783
1784         *ifi = bridge;
1785
1786         r = sd_rtnl_open(&rtnl, 0);
1787         if (r < 0) {
1788                 log_error("Failed to connect to netlink: %s", strerror(-r));
1789                 return r;
1790         }
1791
1792         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1793         if (r < 0) {
1794                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1795                 return r;
1796         }
1797
1798         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1799         if (r < 0) {
1800                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1801                 return r;
1802         }
1803
1804         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1805         if (r < 0) {
1806                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1807                 return r;
1808         }
1809
1810         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1811         if (r < 0) {
1812                 log_error("Failed to add netlink master field: %s", strerror(-r));
1813                 return r;
1814         }
1815
1816         r = sd_rtnl_call(rtnl, m, 0, NULL);
1817         if (r < 0) {
1818                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1819                 return r;
1820         }
1821
1822         return 0;
1823 }
1824
1825 static int parse_interface(struct udev *udev, const char *name) {
1826         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1827         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1828         int ifi;
1829
1830         ifi = (int) if_nametoindex(name);
1831         if (ifi <= 0) {
1832                 log_error("Failed to resolve interface %s: %m", name);
1833                 return -errno;
1834         }
1835
1836         sprintf(ifi_str, "n%i", ifi);
1837         d = udev_device_new_from_device_id(udev, ifi_str);
1838         if (!d) {
1839                 log_error("Failed to get udev device for interface %s: %m", name);
1840                 return -errno;
1841         }
1842
1843         if (udev_device_get_is_initialized(d) <= 0) {
1844                 log_error("Network interface %s is not initialized yet.", name);
1845                 return -EBUSY;
1846         }
1847
1848         return ifi;
1849 }
1850
1851 static int move_network_interfaces(pid_t pid) {
1852         _cleanup_udev_unref_ struct udev *udev = NULL;
1853         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1854         char **i;
1855         int r;
1856
1857         if (!arg_private_network)
1858                 return 0;
1859
1860         if (strv_isempty(arg_network_interfaces))
1861                 return 0;
1862
1863         r = sd_rtnl_open(&rtnl, 0);
1864         if (r < 0) {
1865                 log_error("Failed to connect to netlink: %s", strerror(-r));
1866                 return r;
1867         }
1868
1869         udev = udev_new();
1870         if (!udev) {
1871                 log_error("Failed to connect to udev.");
1872                 return -ENOMEM;
1873         }
1874
1875         STRV_FOREACH(i, arg_network_interfaces) {
1876                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1877                 int ifi;
1878
1879                 ifi = parse_interface(udev, *i);
1880                 if (ifi < 0)
1881                         return ifi;
1882
1883                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1884                 if (r < 0) {
1885                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1886                         return r;
1887                 }
1888
1889                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1890                 if (r < 0) {
1891                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1892                         return r;
1893                 }
1894
1895                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1896                 if (r < 0) {
1897                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1898                         return r;
1899                 }
1900         }
1901
1902         return 0;
1903 }
1904
1905 static int setup_macvlan(pid_t pid) {
1906         _cleanup_udev_unref_ struct udev *udev = NULL;
1907         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1908         char **i;
1909         int r;
1910
1911         if (!arg_private_network)
1912                 return 0;
1913
1914         if (strv_isempty(arg_network_macvlan))
1915                 return 0;
1916
1917         r = sd_rtnl_open(&rtnl, 0);
1918         if (r < 0) {
1919                 log_error("Failed to connect to netlink: %s", strerror(-r));
1920                 return r;
1921         }
1922
1923         udev = udev_new();
1924         if (!udev) {
1925                 log_error("Failed to connect to udev.");
1926                 return -ENOMEM;
1927         }
1928
1929         STRV_FOREACH(i, arg_network_macvlan) {
1930                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1931                 _cleanup_free_ char *n = NULL;
1932                 int ifi;
1933
1934                 ifi = parse_interface(udev, *i);
1935                 if (ifi < 0)
1936                         return ifi;
1937
1938                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1939                 if (r < 0) {
1940                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1941                         return r;
1942                 }
1943
1944                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1945                 if (r < 0) {
1946                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1947                         return r;
1948                 }
1949
1950                 n = strappend("mv-", *i);
1951                 if (!n)
1952                         return log_oom();
1953
1954                 strshorten(n, IFNAMSIZ-1);
1955
1956                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1957                 if (r < 0) {
1958                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1959                         return r;
1960                 }
1961
1962                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1963                 if (r < 0) {
1964                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1965                         return r;
1966                 }
1967
1968                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1969                 if (r < 0) {
1970                         log_error("Failed to open netlink container: %s", strerror(-r));
1971                         return r;
1972                 }
1973
1974                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1975                 if (r < 0) {
1976                         log_error("Failed to open netlink container: %s", strerror(-r));
1977                         return r;
1978                 }
1979
1980                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1981                 if (r < 0) {
1982                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1983                         return r;
1984                 }
1985
1986                 r = sd_rtnl_message_close_container(m);
1987                 if (r < 0) {
1988                         log_error("Failed to close netlink container: %s", strerror(-r));
1989                         return r;
1990                 }
1991
1992                 r = sd_rtnl_message_close_container(m);
1993                 if (r < 0) {
1994                         log_error("Failed to close netlink container: %s", strerror(-r));
1995                         return r;
1996                 }
1997
1998                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1999                 if (r < 0) {
2000                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
2001                         return r;
2002                 }
2003         }
2004
2005         return 0;
2006 }
2007
/*
 * Installs the container's seccomp filter (default action: allow):
 *
 *   - the system calls in the table below fail with EPERM;
 *   - socket(AF_NETLINK, *, NETLINK_AUDIT) fails with EAFNOSUPPORT.
 *
 * The filter's NO_NEW_PRIVS attribute is switched off before loading.
 *
 * Returns 0 on success or when built without HAVE_SECCOMP, otherwise the
 * negative error code of the failing libseccomp call.
 */
static int setup_seccomp(void) {

#ifndef HAVE_SECCOMP
        return 0;
#else
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        size_t k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(blocked_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[k], 0);

                /* -EFAULT means the syscall is unknown; just skip it */
                if (r < 0 && r != -EFAULT) {
                        log_error("Failed to block syscall: %s", strerror(-r));
                        goto finish;
                }
        }

        /* Audit is broken in containers, and much of the userspace
         * audit hookup will fail if run inside one. We don't care and
         * simply turn off the creation of audit sockets.
         *
         * This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace takes as indication that
         * audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* The context is released on every path */
        seccomp_release(ctx);
        return r;
#endif

}
2087
2088 static int setup_image(char **device_path, int *loop_nr) {
2089         struct loop_info64 info = {
2090                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2091         };
2092         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2093         _cleanup_free_ char* loopdev = NULL;
2094         struct stat st;
2095         int r, nr;
2096
2097         assert(device_path);
2098         assert(loop_nr);
2099
2100         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2101         if (fd < 0) {
2102                 log_error("Failed to open %s: %m", arg_image);
2103                 return -errno;
2104         }
2105
2106         if (fstat(fd, &st) < 0) {
2107                 log_error("Failed to stat %s: %m", arg_image);
2108                 return -errno;
2109         }
2110
2111         if (S_ISBLK(st.st_mode)) {
2112                 char *p;
2113
2114                 p = strdup(arg_image);
2115                 if (!p)
2116                         return log_oom();
2117
2118                 *device_path = p;
2119
2120                 *loop_nr = -1;
2121
2122                 r = fd;
2123                 fd = -1;
2124
2125                 return r;
2126         }
2127
2128         if (!S_ISREG(st.st_mode)) {
2129                 log_error("%s is not a regular file or block device: %m", arg_image);
2130                 return -EINVAL;
2131         }
2132
2133         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2134         if (control < 0) {
2135                 log_error("Failed to open /dev/loop-control: %m");
2136                 return -errno;
2137         }
2138
2139         nr = ioctl(control, LOOP_CTL_GET_FREE);
2140         if (nr < 0) {
2141                 log_error("Failed to allocate loop device: %m");
2142                 return -errno;
2143         }
2144
2145         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2146                 return log_oom();
2147
2148         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2149         if (loop < 0) {
2150                 log_error("Failed to open loop device %s: %m", loopdev);
2151                 return -errno;
2152         }
2153
2154         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2155                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2156                 return -errno;
2157         }
2158
2159         if (arg_read_only)
2160                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2161
2162         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2163                 log_error("Failed to set loopback settings on %s: %m", loopdev);
2164                 return -errno;
2165         }
2166
2167         *device_path = loopdev;
2168         loopdev = NULL;
2169
2170         *loop_nr = nr;
2171
2172         r = loop;
2173         loop = -1;
2174
2175         return r;
2176 }
2177
#ifdef HAVE_BLKID
/*
 * Helper of dissect_image(): records the partition (node, nr, rw) in
 * *device / *device_nr / *device_rw unless a partition with a lower or
 * equal number has been recorded there already.
 *
 * Returns 0 on success (whether or not anything was recorded), or the
 * result of log_oom() if the device node path cannot be duplicated; in
 * that case *device is left NULL.
 */
static int dissect_consider_partition(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, bool rw) {

        if (*device && nr >= *device_nr)
                return 0;

        *device_nr = nr;
        *device_rw = rw;

        free(*device);
        *device = strdup(node);
        if (!*device)
                return log_oom();

        return 0;
}
#endif

/*
 * Inspects the GPT partition table of the block device open on 'fd' and
 * picks the partitions to use for the container.
 *
 * Partitions are matched by GPT type against GPT_HOME, GPT_SRV and, where
 * defined, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY. Partitions flagged
 * GPT_FLAG_NO_AUTO are ignored. If several partitions have the same type,
 * the one with the lowest partition number wins.
 *
 * On success:
 *   *root_device / *root_device_rw  describe the root partition; the
 *                                   native one is preferred over the
 *                                   secondary one,
 *   *secondary                      tells whether the secondary root
 *                                   was picked,
 *   *home_device / *home_device_rw  and
 *   *srv_device / *srv_device_rw    are only written if such a partition
 *                                   was found.
 * Device strings are newly allocated and owned by the caller. The *_rw
 * values are false for partitions flagged GPT_FLAG_READ_ONLY.
 *
 * Returns 0 on success, a negative errno-style code on failure: -EINVAL
 * if there is no partition table, no GPT, or no root partition, and
 * -ENOTSUP when compiled without blkid support.
 */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifndef HAVE_BLKID
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#else
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        /* Step 1: probe the partition table with libblkid */

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                /* No errno set is treated as out-of-memory */
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }
        if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        /* Step 2: enumerate the child devices of the block device via udev */

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        /* Step 3: match every enumerated device against the blkid
         * partition list and sort it into home/srv/root/secondary root */

        udev_list_entry_foreach(item, udev_enumerate_get_list_entry(e)) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                bool rw;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number, and the whole
                 * disk itself */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0 || st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                rw = !(flags & GPT_FLAG_READ_ONLY);

                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_consider_partition(&home, &home_nr, &home_rw, node, nr, rw);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_consider_partition(&srv, &srv_nr, &srv_rw, node, nr, rw);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_consider_partition(&root, &root_nr, &root_rw, node, nr, rw);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_consider_partition(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, rw);
#endif
                else
                        continue; /* some other partition type */

                if (r < 0)
                        return r;
        }

        /* Step 4: hand the results over to the caller */

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else {
                /* Guaranteed non-NULL by the check above */
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#endif
}
2427
/*
 * Mounts the block device 'what' on the directory formed by 'where' plus
 * the optional suffix 'directory' (pass NULL to mount on 'where' itself).
 *
 * The file system type is detected with libblkid. The mount is done with
 * MS_NODEV, and additionally MS_RDONLY if 'rw' is false or arg_read_only
 * is set.
 *
 * Returns 0 on success, a negative errno-style code on failure:
 *   -EINVAL   no file system, or no unambiguous one, could be detected,
 *   -ENOTSUP  the device is a LUKS volume, or we were compiled without
 *             blkid support.
 */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                /* No errno set is treated as out-of-memory */
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2
         * if the result is ambivalent; both mean we cannot tell the
         * file system type. -1 is a genuine probing error and is
         * reported as such below. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2493
/*
 * Mount the root, /home and /srv devices below the directory "where".
 *
 * Each *_device argument may be NULL, in which case that mount is
 * skipped. The matching *_device_rw flag is handed to mount_device()
 * as its read-write request. Mounting happens in the order root,
 * /home, /srv and stops at the first failure.
 *
 * Returns 0 on success, or the negative error returned by the failing
 * mount_device() call.
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Use the "where" parameter rather than the arg_directory
         * global, so that the function honours its own argument. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2529
2530 static void loop_remove(int nr, int *image_fd) {
2531         _cleanup_close_ int control = -1;
2532
2533         if (nr < 0)
2534                 return;
2535
2536         if (image_fd && *image_fd >= 0) {
2537                 ioctl(*image_fd, LOOP_CLR_FD);
2538                 *image_fd = safe_close(*image_fd);
2539         }
2540
2541         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2542         if (control < 0)
2543                 return;
2544
2545         ioctl(control, LOOP_CTL_REMOVE, nr);
2546 }
2547
2548 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2549         int pipe_fds[2];
2550         pid_t pid;
2551
2552         assert(database);
2553         assert(key);
2554         assert(rpid);
2555
2556         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2557                 log_error("Failed to allocate pipe: %m");
2558                 return -errno;
2559         }
2560
2561         pid = fork();
2562         if (pid < 0) {
2563                 log_error("Failed to fork getent child: %m");
2564                 return -errno;
2565         } else if (pid == 0) {
2566                 int nullfd;
2567                 char *empty_env = NULL;
2568
2569                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2570                         _exit(EXIT_FAILURE);
2571
2572                 if (pipe_fds[0] > 2)
2573                         safe_close(pipe_fds[0]);
2574                 if (pipe_fds[1] > 2)
2575                         safe_close(pipe_fds[1]);
2576
2577                 nullfd = open("/dev/null", O_RDWR);
2578                 if (nullfd < 0)
2579                         _exit(EXIT_FAILURE);
2580
2581                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2582                         _exit(EXIT_FAILURE);
2583
2584                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2585                         _exit(EXIT_FAILURE);
2586
2587                 if (nullfd > 2)
2588                         safe_close(nullfd);
2589
2590                 reset_all_signal_handlers();
2591                 close_all_fds(NULL, 0);
2592
2593                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2594                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2595                 _exit(EXIT_FAILURE);
2596         }
2597
2598         pipe_fds[1] = safe_close(pipe_fds[1]);
2599
2600         *rpid = pid;
2601
2602         return pipe_fds[0];
2603 }
2604
2605 static int change_uid_gid(char **_home) {
2606         char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2607         _cleanup_free_ uid_t *uids = NULL;
2608         _cleanup_free_ char *home = NULL;
2609         _cleanup_fclose_ FILE *f = NULL;
2610         _cleanup_close_ int fd = -1;
2611         unsigned n_uids = 0;
2612         size_t sz = 0, l;
2613         uid_t uid;
2614         gid_t gid;
2615         pid_t pid;
2616         int r;
2617
2618         assert(_home);
2619
2620         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2621                 /* Reset everything fully to 0, just in case */
2622
2623                 if (setgroups(0, NULL) < 0) {
2624                         log_error("setgroups() failed: %m");
2625                         return -errno;
2626                 }
2627
2628                 if (setresgid(0, 0, 0) < 0) {
2629                         log_error("setregid() failed: %m");
2630                         return -errno;
2631                 }
2632
2633                 if (setresuid(0, 0, 0) < 0) {
2634                         log_error("setreuid() failed: %m");
2635                         return -errno;
2636                 }
2637
2638                 *_home = NULL;
2639                 return 0;
2640         }
2641
2642         /* First, get user credentials */
2643         fd = spawn_getent("passwd", arg_user, &pid);
2644         if (fd < 0)
2645                 return fd;
2646
2647         f = fdopen(fd, "r");
2648         if (!f)
2649                 return log_oom();
2650         fd = -1;
2651
2652         if (!fgets(line, sizeof(line), f)) {
2653
2654                 if (!ferror(f)) {
2655                         log_error("Failed to resolve user %s.", arg_user);
2656                         return -ESRCH;
2657                 }
2658
2659                 log_error("Failed to read from getent: %m");
2660                 return -errno;
2661         }
2662
2663         truncate_nl(line);
2664
2665         wait_for_terminate_and_warn("getent passwd", pid);
2666
2667         x = strchr(line, ':');
2668         if (!x) {
2669                 log_error("/etc/passwd entry has invalid user field.");
2670                 return -EIO;
2671         }
2672
2673         u = strchr(x+1, ':');
2674         if (!u) {
2675                 log_error("/etc/passwd entry has invalid password field.");
2676                 return -EIO;
2677         }
2678
2679         u++;
2680         g = strchr(u, ':');
2681         if (!g) {
2682                 log_error("/etc/passwd entry has invalid UID field.");
2683                 return -EIO;
2684         }
2685
2686         *g = 0;
2687         g++;
2688         x = strchr(g, ':');
2689         if (!x) {
2690                 log_error("/etc/passwd entry has invalid GID field.");
2691                 return -EIO;
2692         }
2693
2694         *x = 0;
2695         h = strchr(x+1, ':');
2696         if (!h) {
2697                 log_error("/etc/passwd entry has invalid GECOS field.");
2698                 return -EIO;
2699         }
2700
2701         h++;
2702         x = strchr(h, ':');
2703         if (!x) {
2704                 log_error("/etc/passwd entry has invalid home directory field.");
2705                 return -EIO;
2706         }
2707
2708         *x = 0;
2709
2710         r = parse_uid(u, &uid);
2711         if (r < 0) {
2712                 log_error("Failed to parse UID of user.");
2713                 return -EIO;
2714         }
2715
2716         r = parse_gid(g, &gid);
2717         if (r < 0) {
2718                 log_error("Failed to parse GID of user.");
2719                 return -EIO;
2720         }
2721
2722         home = strdup(h);
2723         if (!home)
2724                 return log_oom();
2725
2726         /* Second, get group memberships */
2727         fd = spawn_getent("initgroups", arg_user, &pid);
2728         if (fd < 0)
2729                 return fd;
2730
2731         fclose(f);
2732         f = fdopen(fd, "r");
2733         if (!f)
2734                 return log_oom();
2735         fd = -1;
2736
2737         if (!fgets(line, sizeof(line), f)) {
2738                 if (!ferror(f)) {
2739                         log_error("Failed to resolve user %s.", arg_user);
2740                         return -ESRCH;
2741                 }
2742
2743                 log_error("Failed to read from getent: %m");
2744                 return -errno;
2745         }
2746
2747         truncate_nl(line);
2748
2749         wait_for_terminate_and_warn("getent initgroups", pid);
2750
2751         /* Skip over the username and subsequent separator whitespace */
2752         x = line;
2753         x += strcspn(x, WHITESPACE);
2754         x += strspn(x, WHITESPACE);
2755
2756         FOREACH_WORD(w, l, x, state) {
2757                 char c[l+1];
2758
2759                 memcpy(c, w, l);
2760                 c[l] = 0;
2761
2762                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2763                         return log_oom();
2764
2765                 r = parse_uid(c, &uids[n_uids++]);
2766                 if (r < 0) {
2767                         log_error("Failed to parse group data from getent.");
2768                         return -EIO;
2769                 }
2770         }
2771
2772         r = mkdir_parents(home, 0775);
2773         if (r < 0) {
2774                 log_error("Failed to make home root directory: %s", strerror(-r));
2775                 return r;
2776         }
2777
2778         r = mkdir_safe(home, 0755, uid, gid);
2779         if (r < 0 && r != -EEXIST) {
2780                 log_error("Failed to make home directory: %s", strerror(-r));
2781                 return r;
2782         }
2783
2784         fchown(STDIN_FILENO, uid, gid);
2785         fchown(STDOUT_FILENO, uid, gid);
2786         fchown(STDERR_FILENO, uid, gid);
2787
2788         if (setgroups(n_uids, uids) < 0) {
2789                 log_error("Failed to set auxiliary groups: %m");
2790                 return -errno;
2791         }
2792
2793         if (setresgid(gid, gid, gid) < 0) {
2794                 log_error("setregid() failed: %m");
2795                 return -errno;
2796         }
2797
2798         if (setresuid(uid, uid, uid) < 0) {
2799                 log_error("setreuid() failed: %m");
2800                 return -errno;
2801         }
2802
2803         if (_home) {
2804                 *_home = home;
2805                 home = NULL;
2806         }
2807
2808         return 0;
2809 }
2810
2811 /*
2812  * Return values:
2813  * < 0 : wait_for_terminate() failed to get the state of the
2814  *       container, the container was terminated by a signal, or
2815  *       failed for an unknown reason.  No change is made to the
2816  *       container argument.
2817  * > 0 : The program executed in the container terminated with an
2818  *       error.  The exit code of the program executed in the
2819  *       container is returned.  No change is made to the container
2820  *       argument.
2821  *   0 : The container is being rebooted, has been shut down or exited
2822  *       successfully.  The container argument has been set to either
2823  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2824  *
2825  * That is, success is indicated by a return value of zero, and an
2826  * error is indicated by a non-zero value.
2827  */
2828 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2829         int r;
2830         siginfo_t status;
2831
2832         r = wait_for_terminate(pid, &status);
2833         if (r < 0) {
2834                 log_warning("Failed to wait for container: %s", strerror(-r));
2835                 return r;
2836         }
2837
2838         switch (status.si_code) {
2839         case CLD_EXITED:
2840                 r = status.si_status;
2841                 if (r == 0) {
2842                         if (!arg_quiet)
2843                                 log_debug("Container %s exited successfully.",
2844                                           arg_machine);
2845
2846                         *container = CONTAINER_TERMINATED;
2847                 } else {
2848                         log_error("Container %s failed with error code %i.",
2849                                   arg_machine, status.si_status);
2850                 }
2851                 break;
2852
2853         case CLD_KILLED:
2854                 if (status.si_status == SIGINT) {
2855                         if (!arg_quiet)
2856                                 log_info("Container %s has been shut down.",
2857                                          arg_machine);
2858
2859                         *container = CONTAINER_TERMINATED;
2860                         r = 0;
2861                         break;
2862                 } else if (status.si_status == SIGHUP) {
2863                         if (!arg_quiet)
2864                                 log_info("Container %s is being rebooted.",
2865                                          arg_machine);
2866
2867                         *container = CONTAINER_REBOOTED;
2868                         r = 0;
2869                         break;
2870                 }
2871                 /* CLD_KILLED fallthrough */
2872
2873         case CLD_DUMPED:
2874                 log_error("Container %s terminated by signal %s.",
2875                           arg_machine, signal_to_string(status.si_status));
2876                 r = -1;
2877                 break;
2878
2879         default:
2880                 log_error("Container %s failed due to unknown reason.",
2881                           arg_machine);
2882                 r = -1;
2883                 break;
2884         }
2885
2886         return r;
2887 }
2888
/* Signal handler that intentionally does nothing; main() installs it
 * for SIGCHLD so that the signal interrupts blocking calls. */
static void nop_handler(int sig) {
}
2890
2891 int main(int argc, char *argv[]) {
2892
2893         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2894         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2895         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2896         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2897         _cleanup_fdset_free_ FDSet *fds = NULL;
2898         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2899         const char *console = NULL;
2900         char veth_name[IFNAMSIZ];
2901         bool secondary = false;
2902         sigset_t mask, mask_chld;
2903         pid_t pid = 0;
2904
2905         log_parse_environment();
2906         log_open();
2907
2908         k = parse_argv(argc, argv);
2909         if (k < 0)
2910                 goto finish;
2911         else if (k == 0) {
2912                 r = EXIT_SUCCESS;
2913                 goto finish;
2914         }
2915
2916         if (!arg_image) {
2917                 if (arg_directory) {
2918                         char *p;
2919
2920                         p = path_make_absolute_cwd(arg_directory);
2921                         free(arg_directory);
2922                         arg_directory = p;
2923                 } else
2924                         arg_directory = get_current_dir_name();
2925
2926                 if (!arg_directory) {
2927                         log_error("Failed to determine path, please use -D.");
2928                         goto finish;
2929                 }
2930                 path_kill_slashes(arg_directory);
2931         }
2932
2933         if (!arg_machine) {
2934                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2935                 if (!arg_machine) {
2936                         log_oom();
2937                         goto finish;
2938                 }
2939
2940                 hostname_cleanup(arg_machine, false);
2941                 if (isempty(arg_machine)) {
2942                         log_error("Failed to determine machine name automatically, please use -M.");
2943                         goto finish;
2944                 }
2945         }
2946
2947         if (geteuid() != 0) {
2948                 log_error("Need to be root.");
2949                 goto finish;
2950         }
2951
2952         if (sd_booted() <= 0) {
2953                 log_error("Not running on a systemd system.");
2954                 goto finish;
2955         }
2956
2957         log_close();
2958         n_fd_passed = sd_listen_fds(false);
2959         if (n_fd_passed > 0) {
2960                 k = fdset_new_listen_fds(&fds, false);
2961                 if (k < 0) {
2962                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2963                         goto finish;
2964                 }
2965         }
2966         fdset_close_others(fds);
2967         log_open();
2968
2969         if (arg_directory) {
2970                 if (path_equal(arg_directory, "/")) {
2971                         log_error("Spawning container on root directory not supported.");
2972                         goto finish;
2973                 }
2974
2975                 if (arg_boot) {
2976                         if (path_is_os_tree(arg_directory) <= 0) {
2977                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2978                                 goto finish;
2979                         }
2980                 } else {
2981                         const char *p;
2982
2983                         p = strappenda(arg_directory,
2984                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2985                         if (access(p, F_OK) < 0) {
2986                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2987                                 goto finish;
2988
2989                         }
2990                 }
2991         } else {
2992                 char template[] = "/tmp/nspawn-root-XXXXXX";
2993
2994                 if (!mkdtemp(template)) {
2995                         log_error("Failed to create temporary directory: %m");
2996                         r = -errno;
2997                         goto finish;
2998                 }
2999
3000                 arg_directory = strdup(template);
3001                 if (!arg_directory) {
3002                         r = log_oom();
3003                         goto finish;
3004                 }
3005
3006                 image_fd = setup_image(&device_path, &loop_nr);
3007                 if (image_fd < 0) {
3008                         r = image_fd;
3009                         goto finish;
3010                 }
3011
3012                 r = dissect_image(image_fd,
3013                                   &root_device, &root_device_rw,
3014                                   &home_device, &home_device_rw,
3015                                   &srv_device, &srv_device_rw,
3016                                   &secondary);
3017                 if (r < 0)
3018                         goto finish;
3019         }
3020
3021         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3022         if (master < 0) {
3023                 log_error("Failed to acquire pseudo tty: %m");
3024                 goto finish;
3025         }
3026
3027         console = ptsname(master);
3028         if (!console) {
3029                 log_error("Failed to determine tty name: %m");
3030                 goto finish;
3031         }
3032
3033         if (!arg_quiet)
3034                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3035                          arg_machine, arg_image ? arg_image : arg_directory);
3036
3037         if (unlockpt(master) < 0) {
3038                 log_error("Failed to unlock tty: %m");
3039                 goto finish;
3040         }
3041
3042         if (access("/dev/kdbus/control", F_OK) >= 0) {
3043
3044                 if (arg_share_system) {
3045                         kdbus_domain = strdup("/dev/kdbus");
3046                         if (!kdbus_domain) {
3047                                 log_oom();
3048                                 goto finish;
3049                         }
3050                 } else {
3051                         const char *ns;
3052
3053                         ns = strappenda("machine-", arg_machine);
3054                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
3055                         if (r < 0)
3056                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
3057                         else
3058                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
3059                 }
3060         }
3061
3062         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3063                 log_error("Failed to create kmsg socket pair: %m");
3064                 goto finish;
3065         }
3066
3067         sd_notify(0, "READY=1");
3068
3069         assert_se(sigemptyset(&mask) == 0);
3070         assert_se(sigemptyset(&mask_chld) == 0);
3071         sigaddset(&mask_chld, SIGCHLD);
3072         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3073         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3074
3075         for (;;) {
3076                 ContainerStatus container_status;
3077                 int eventfds[2] = { -1, -1 };
3078                 struct sigaction sa = {
3079                         .sa_handler = nop_handler,
3080                         .sa_flags = SA_NOCLDSTOP,
3081                 };
3082
3083                 /* Child can be killed before execv(), so handle SIGCHLD
3084                  * in order to interrupt parent's blocking calls and
3085                  * give it a chance to call wait() and terminate. */
3086                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3087                 if (r < 0) {
3088                         log_error("Failed to change the signal mask: %m");
3089                         goto finish;
3090                 }
3091
3092                 r = sigaction(SIGCHLD, &sa, NULL);
3093                 if (r < 0) {
3094                         log_error("Failed to install SIGCHLD handler: %m");
3095                         goto finish;
3096                 }
3097
3098                 pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
3099                                          (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3100                                          (arg_private_network ? CLONE_NEWNET : 0), eventfds);
3101                 if (pid < 0) {
3102                         if (errno == EINVAL)
3103                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3104                         else
3105                                 log_error("clone() failed: %m");
3106
3107                         r = pid;
3108                         goto finish;
3109                 }
3110
3111                 if (pid == 0) {
3112                         /* child */
3113                         _cleanup_free_ char *home = NULL;
3114                         unsigned n_env = 2;
3115                         const char *envp[] = {
3116                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3117                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3118                                 NULL, /* TERM */
3119                                 NULL, /* HOME */
3120                                 NULL, /* USER */
3121                                 NULL, /* LOGNAME */
3122                                 NULL, /* container_uuid */
3123                                 NULL, /* LISTEN_FDS */
3124                                 NULL, /* LISTEN_PID */
3125                                 NULL
3126                         };
3127                         char **env_use;
3128
3129                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3130                         if (envp[n_env])
3131                                 n_env ++;
3132
3133                         master = safe_close(master);
3134
3135                         close_nointr(STDIN_FILENO);
3136                         close_nointr(STDOUT_FILENO);
3137                         close_nointr(STDERR_FILENO);
3138
3139                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3140
3141                         reset_all_signal_handlers();
3142
3143                         assert_se(sigemptyset(&mask) == 0);
3144                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
3145
3146                         k = open_terminal(console, O_RDWR);
3147                         if (k != STDIN_FILENO) {
3148                                 if (k >= 0) {
3149                                         safe_close(k);
3150                                         k = -EINVAL;
3151                                 }
3152
3153                                 log_error("Failed to open console: %s", strerror(-k));
3154                                 goto child_fail;
3155                         }
3156
3157                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3158                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3159                                 log_error("Failed to duplicate console: %m");
3160                                 goto child_fail;
3161                         }
3162
3163                         if (setsid() < 0) {
3164                                 log_error("setsid() failed: %m");
3165                                 goto child_fail;
3166                         }
3167
3168                         if (reset_audit_loginuid() < 0)
3169                                 goto child_fail;
3170
3171                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3172                                 log_error("PR_SET_PDEATHSIG failed: %m");
3173                                 goto child_fail;
3174                         }
3175
3176                         /* Mark everything as slave, so that we still
3177                          * receive mounts from the real root, but don't
3178                          * propagate mounts to the real root. */
3179                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3180                                 log_error("MS_SLAVE|MS_REC failed: %m");
3181                                 goto child_fail;
3182                         }
3183
3184                         if (mount_devices(arg_directory,
3185                                           root_device, root_device_rw,
3186                                           home_device, home_device_rw,
3187                                           srv_device, srv_device_rw) < 0)
3188                                 goto child_fail;
3189
3190                         /* Turn directory into bind mount */
3191                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3192                                 log_error("Failed to make bind mount: %m");
3193                                 goto child_fail;
3194                         }
3195
3196                         r = setup_volatile(arg_directory);
3197                         if (r < 0)
3198                                 goto child_fail;
3199
3200                         if (setup_volatile_state(arg_directory) < 0)
3201                                 goto child_fail;
3202
3203                         r = base_filesystem_create(arg_directory);
3204                         if (r < 0)
3205                                 goto child_fail;
3206
3207                         if (arg_read_only) {
3208                                 k = bind_remount_recursive(arg_directory, true);
3209                                 if (k < 0) {
3210                                         log_error("Failed to make tree read-only: %s", strerror(-k));
3211                                         goto child_fail;
3212                                 }
3213                         }
3214
3215                         if (mount_all(arg_directory) < 0)
3216                                 goto child_fail;
3217
3218                         if (copy_devnodes(arg_directory) < 0)
3219                                 goto child_fail;
3220
3221                         if (setup_ptmx(arg_directory) < 0)
3222                                 goto child_fail;
3223
3224                         dev_setup(arg_directory);
3225
3226                         if (setup_seccomp() < 0)
3227                                 goto child_fail;
3228
3229                         if (setup_dev_console(arg_directory, console) < 0)
3230                                 goto child_fail;
3231
3232                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3233                                 goto child_fail;
3234
3235                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3236
3237                         if (setup_boot_id(arg_directory) < 0)
3238                                 goto child_fail;
3239
3240                         if (setup_timezone(arg_directory) < 0)
3241                                 goto child_fail;
3242
3243                         if (setup_resolv_conf(arg_directory) < 0)
3244                                 goto child_fail;
3245
3246                         if (setup_journal(arg_directory) < 0)
3247                                 goto child_fail;
3248
3249                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3250                                 goto child_fail;
3251
3252                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3253                                 goto child_fail;
3254
3255                         if (mount_tmpfs(arg_directory) < 0)
3256                                 goto child_fail;
3257
3258                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3259                                 goto child_fail;
3260
3261                         /* Tell the parent that we are ready, and that
3262                          * it can cgroupify us to that we lack access
3263                          * to certain devices and resources. */
3264                         r = eventfd_send_state(eventfds[1],
3265                                                EVENTFD_CHILD_SUCCEEDED);
3266                         eventfds[1] = safe_close(eventfds[1]);
3267                         if (r < 0)
3268                                 goto child_fail;
3269
3270                         if (chdir(arg_directory) < 0) {
3271                                 log_error("chdir(%s) failed: %m", arg_directory);
3272                                 goto child_fail;
3273                         }
3274
3275                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3276                                 log_error("mount(MS_MOVE) failed: %m");
3277                                 goto child_fail;
3278                         }
3279
3280                         if (chroot(".") < 0) {
3281                                 log_error("chroot() failed: %m");
3282                                 goto child_fail;
3283                         }
3284
3285                         if (chdir("/") < 0) {
3286                                 log_error("chdir() failed: %m");
3287                                 goto child_fail;
3288                         }
3289
3290                         umask(0022);
3291
3292                         if (arg_private_network)
3293                                 loopback_setup();
3294
3295                         if (drop_capabilities() < 0) {
3296                                 log_error("drop_capabilities() failed: %m");
3297                                 goto child_fail;
3298                         }
3299
3300                         r = change_uid_gid(&home);
3301                         if (r < 0)
3302                                 goto child_fail;
3303
3304                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3305                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3306                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3307                                 log_oom();
3308                                 goto child_fail;
3309                         }
3310
3311                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3312                                 char as_uuid[37];
3313
3314                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3315                                         log_oom();
3316                                         goto child_fail;
3317                                 }
3318                         }
3319
3320                         if (fdset_size(fds) > 0) {
3321                                 k = fdset_cloexec(fds, false);
3322                                 if (k < 0) {
3323                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3324                                         goto child_fail;
3325                                 }
3326
3327                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3328                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3329                                         log_oom();
3330                                         goto child_fail;
3331                                 }
3332                         }
3333
3334                         setup_hostname();
3335
3336                         if (arg_personality != 0xffffffffLU) {
3337                                 if (personality(arg_personality) < 0) {
3338                                         log_error("personality() failed: %m");
3339                                         goto child_fail;
3340                                 }
3341                         } else if (secondary) {
3342                                 if (personality(PER_LINUX32) < 0) {
3343                                         log_error("personality() failed: %m");
3344                                         goto child_fail;
3345                                 }
3346                         }
3347
3348 #ifdef HAVE_SELINUX
3349                         if (arg_selinux_context)
3350                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3351                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3352                                         goto child_fail;
3353                                 }
3354 #endif
3355
3356                         if (!strv_isempty(arg_setenv)) {
3357                                 char **n;
3358
3359                                 n = strv_env_merge(2, envp, arg_setenv);
3360                                 if (!n) {
3361                                         log_oom();
3362                                         goto child_fail;
3363                                 }
3364
3365                                 env_use = n;
3366                         } else
3367                                 env_use = (char**) envp;
3368
3369                         /* Wait until the parent is ready with the setup, too... */
3370                         r = eventfd_parent_succeeded(eventfds[0]);
3371                         eventfds[0] = safe_close(eventfds[0]);
3372                         if (r < 0)
3373                                 goto child_fail;
3374
3375                         if (arg_boot) {
3376                                 char **a;
3377                                 size_t l;
3378
3379                                 /* Automatically search for the init system */
3380
3381                                 l = 1 + argc - optind;
3382                                 a = newa(char*, l + 1);
3383                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3384
3385                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3386                                 execve(a[0], a, env_use);
3387
3388                                 a[0] = (char*) "/lib/systemd/systemd";
3389                                 execve(a[0], a, env_use);
3390
3391                                 a[0] = (char*) "/sbin/init";
3392                                 execve(a[0], a, env_use);
3393                         } else if (argc > optind)
3394                                 execvpe(argv[optind], argv + optind, env_use);
3395                         else {
3396                                 chdir(home ? home : "/root");
3397                                 execle("/bin/bash", "-bash", NULL, env_use);
3398                                 execle("/bin/sh", "-sh", NULL, env_use);
3399                         }
3400
3401                         log_error("execv() failed: %m");
3402
3403                 child_fail:
3404                         /* Tell the parent that the setup failed, so he
3405                          * can clean up resources and terminate. */
3406                         if (eventfds[1] != -1)
3407                                 eventfd_send_state(eventfds[1],
3408                                                    EVENTFD_CHILD_FAILED);
3409                         _exit(EXIT_FAILURE);
3410                 }
3411
3412                 fdset_free(fds);
3413                 fds = NULL;
3414
3415                 /* Wait for the child event:
3416                  * If EVENTFD_CHILD_FAILED, the child will terminate soon.
3417                  * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
3418                  * it is ready with all it needs to do with privileges.
3419                  * After we got the notification we can make the process
3420                  * join its cgroup which might limit what it can do */
3421                 r = eventfd_child_succeeded(eventfds[1]);
3422                 eventfds[1] = safe_close(eventfds[1]);
3423
3424                 if (r >= 0) {
3425                         int ifi = 0;
3426
3427                         r = move_network_interfaces(pid);
3428                         if (r < 0)
3429                                 goto finish;
3430
3431                         r = setup_veth(pid, veth_name, &ifi);
3432                         if (r < 0)
3433                                 goto finish;
3434
3435                         r = setup_bridge(veth_name, &ifi);
3436                         if (r < 0)
3437                                 goto finish;
3438
3439                         r = setup_macvlan(pid);
3440                         if (r < 0)
3441                                 goto finish;
3442
3443                         r = register_machine(pid, ifi);
3444                         if (r < 0)
3445                                 goto finish;
3446
3447                         /* Block SIGCHLD here, before notifying child.
3448                          * process_pty() will handle it with the other signals. */
3449                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3450                         if (r < 0)
3451                                 goto finish;
3452
3453                         /* Reset signal to default */
3454                         r = default_signals(SIGCHLD, -1);
3455                         if (r < 0)
3456                                 goto finish;
3457
3458                         /* Notify the child that the parent is ready with all
3459                          * its setup, and that the child can now hand over
3460                          * control to the code to run inside the container. */
3461                         r = eventfd_send_state(eventfds[0], EVENTFD_PARENT_SUCCEEDED);
3462                         eventfds[0] = safe_close(eventfds[0]);
3463                         if (r < 0)
3464                                 goto finish;
3465
3466                         k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3467                         if (k < 0) {
3468                                 r = EXIT_FAILURE;
3469                                 break;
3470                         }
3471
3472                         if (!arg_quiet)
3473                                 putc('\n', stdout);
3474
3475                         /* Kill if it is not dead yet anyway */
3476                         terminate_machine(pid);
3477                 }
3478
3479                 /* Normally redundant, but better safe than sorry */
3480                 kill(pid, SIGKILL);
3481
3482                 r = wait_for_container(pid, &container_status);
3483                 pid = 0;
3484
3485                 if (r < 0) {
3486                         /* We failed to wait for the container, or the
3487                          * container exited abnormally */
3488                         r = EXIT_FAILURE;
3489                         break;
3490                 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3491                         /* The container exited with a non-zero
3492                          * status, or with zero status and no reboot
3493                          * was requested. */
3494                         break;
3495
3496                 /* CONTAINER_REBOOTED, loop again */
3497
3498                 if (arg_keep_unit) {
3499                         /* Special handling if we are running as a
3500                          * service: instead of simply restarting the
3501                          * machine we want to restart the entire
3502                          * service, so let's inform systemd about this
3503                          * with the special exit code 133. The service
3504                          * file uses RestartForceExitStatus=133 so
3505                          * that this results in a full nspawn
3506                          * restart. This is necessary since we might
3507                          * have cgroup parameters set we want to have
3508                          * flushed out. */
3509                         r = 133;
3510                         break;
3511                 }
3512         }
3513
3514 finish:
3515         loop_remove(loop_nr, &image_fd);
3516
3517         if (pid > 0)
3518                 kill(pid, SIGKILL);
3519
3520         free(arg_directory);
3521         free(arg_machine);
3522         free(arg_user);
3523         strv_free(arg_setenv);
3524         strv_free(arg_network_interfaces);
3525         strv_free(arg_network_macvlan);
3526         strv_free(arg_bind);
3527         strv_free(arg_bind_ro);
3528         strv_free(arg_tmpfs);
3529
3530         return r;
3531 }