chiark / gitweb /
19fb086e7ab8ef2c9a67278f5329d17f10c80dd8
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "eventfd-util.h"
88 #include "blkid-util.h"
89 #include "gpt.h"
90 #include "siphash24.h"
91
92 #ifdef HAVE_SECCOMP
93 #include "seccomp-util.h"
94 #endif
95
/* How a container run ended. The names distinguish a plain termination from
 * a reboot request; the code acting on the difference is outside this
 * part of the file. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
100
/* Journal linking mode, selected with --link-journal= (or -j). Each value
 * corresponds to the keyword accepted on the command line. */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* --link-journal=no */
        LINK_AUTO  = 1,   /* --link-journal=auto, the built-in default */
        LINK_HOST  = 2,   /* --link-journal=host */
        LINK_GUEST = 3    /* --link-journal=guest, also what -j selects */
} LinkJournal;
107
108 static char *arg_directory = NULL;
109 static char *arg_user = NULL;
110 static sd_id128_t arg_uuid = {};
111 static char *arg_machine = NULL;
112 static const char *arg_selinux_context = NULL;
113 static const char *arg_selinux_apifs_context = NULL;
114 static const char *arg_slice = NULL;
115 static bool arg_private_network = false;
116 static bool arg_read_only = false;
117 static bool arg_boot = false;
118 static LinkJournal arg_link_journal = LINK_AUTO;
119 static uint64_t arg_retain =
120         (1ULL << CAP_CHOWN) |
121         (1ULL << CAP_DAC_OVERRIDE) |
122         (1ULL << CAP_DAC_READ_SEARCH) |
123         (1ULL << CAP_FOWNER) |
124         (1ULL << CAP_FSETID) |
125         (1ULL << CAP_IPC_OWNER) |
126         (1ULL << CAP_KILL) |
127         (1ULL << CAP_LEASE) |
128         (1ULL << CAP_LINUX_IMMUTABLE) |
129         (1ULL << CAP_NET_BIND_SERVICE) |
130         (1ULL << CAP_NET_BROADCAST) |
131         (1ULL << CAP_NET_RAW) |
132         (1ULL << CAP_SETGID) |
133         (1ULL << CAP_SETFCAP) |
134         (1ULL << CAP_SETPCAP) |
135         (1ULL << CAP_SETUID) |
136         (1ULL << CAP_SYS_ADMIN) |
137         (1ULL << CAP_SYS_CHROOT) |
138         (1ULL << CAP_SYS_NICE) |
139         (1ULL << CAP_SYS_PTRACE) |
140         (1ULL << CAP_SYS_TTY_CONFIG) |
141         (1ULL << CAP_SYS_RESOURCE) |
142         (1ULL << CAP_SYS_BOOT) |
143         (1ULL << CAP_AUDIT_WRITE) |
144         (1ULL << CAP_AUDIT_CONTROL) |
145         (1ULL << CAP_MKNOD);
146 static char **arg_bind = NULL;
147 static char **arg_bind_ro = NULL;
148 static char **arg_setenv = NULL;
149 static bool arg_quiet = false;
150 static bool arg_share_system = false;
151 static bool arg_register = true;
152 static bool arg_keep_unit = false;
153 static char **arg_network_interfaces = NULL;
154 static char **arg_network_macvlan = NULL;
155 static bool arg_network_veth = false;
156 static const char *arg_network_bridge = NULL;
157 static unsigned long arg_personality = 0xffffffffLU;
158 static const char *arg_image = NULL;
159
/* Prints the usage summary to stdout.
 *
 * The text must be kept in sync with the option table and the switch
 * statement in parse_argv(): -j selects the "guest" journal link mode there,
 * and --personality= is an accepted option.
 *
 * Always returns 0, which parse_argv() passes straight on to its caller. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "  -i --image=PATH           File system device or image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --personality=ARCH     Pick personality for this container\n",
               program_invocation_short_name);

        return 0;
}
211
212 static int parse_argv(int argc, char *argv[]) {
213
214         enum {
215                 ARG_VERSION = 0x100,
216                 ARG_PRIVATE_NETWORK,
217                 ARG_UUID,
218                 ARG_READ_ONLY,
219                 ARG_CAPABILITY,
220                 ARG_DROP_CAPABILITY,
221                 ARG_LINK_JOURNAL,
222                 ARG_BIND,
223                 ARG_BIND_RO,
224                 ARG_SETENV,
225                 ARG_SHARE_SYSTEM,
226                 ARG_REGISTER,
227                 ARG_KEEP_UNIT,
228                 ARG_NETWORK_INTERFACE,
229                 ARG_NETWORK_MACVLAN,
230                 ARG_NETWORK_VETH,
231                 ARG_NETWORK_BRIDGE,
232                 ARG_PERSONALITY,
233         };
234
235         static const struct option options[] = {
236                 { "help",                  no_argument,       NULL, 'h'                   },
237                 { "version",               no_argument,       NULL, ARG_VERSION           },
238                 { "directory",             required_argument, NULL, 'D'                   },
239                 { "user",                  required_argument, NULL, 'u'                   },
240                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
241                 { "boot",                  no_argument,       NULL, 'b'                   },
242                 { "uuid",                  required_argument, NULL, ARG_UUID              },
243                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
244                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
245                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
246                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
247                 { "bind",                  required_argument, NULL, ARG_BIND              },
248                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
249                 { "machine",               required_argument, NULL, 'M'                   },
250                 { "slice",                 required_argument, NULL, 'S'                   },
251                 { "setenv",                required_argument, NULL, ARG_SETENV            },
252                 { "selinux-context",       required_argument, NULL, 'Z'                   },
253                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
254                 { "quiet",                 no_argument,       NULL, 'q'                   },
255                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
256                 { "register",              required_argument, NULL, ARG_REGISTER          },
257                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
258                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
259                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
260                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
261                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
262                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
263                 { "image",                 required_argument, NULL, 'i'                   },
264                 {}
265         };
266
267         int c, r;
268         uint64_t plus = 0, minus = 0;
269
270         assert(argc >= 0);
271         assert(argv);
272
273         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
274
275                 switch (c) {
276
277                 case 'h':
278                         return help();
279
280                 case ARG_VERSION:
281                         puts(PACKAGE_STRING);
282                         puts(SYSTEMD_FEATURES);
283                         return 0;
284
285                 case 'D':
286                         free(arg_directory);
287                         arg_directory = canonicalize_file_name(optarg);
288                         if (!arg_directory) {
289                                 log_error("Invalid root directory: %m");
290                                 return -ENOMEM;
291                         }
292
293                         break;
294
295                 case 'i':
296                         arg_image = optarg;
297                         break;
298
299                 case 'u':
300                         free(arg_user);
301                         arg_user = strdup(optarg);
302                         if (!arg_user)
303                                 return log_oom();
304
305                         break;
306
307                 case ARG_NETWORK_BRIDGE:
308                         arg_network_bridge = optarg;
309
310                         /* fall through */
311
312                 case ARG_NETWORK_VETH:
313                         arg_network_veth = true;
314                         arg_private_network = true;
315                         break;
316
317                 case ARG_NETWORK_INTERFACE:
318                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
319                                 return log_oom();
320
321                         arg_private_network = true;
322                         break;
323
324                 case ARG_NETWORK_MACVLAN:
325                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
326                                 return log_oom();
327
328                         /* fall through */
329
330                 case ARG_PRIVATE_NETWORK:
331                         arg_private_network = true;
332                         break;
333
334                 case 'b':
335                         arg_boot = true;
336                         break;
337
338                 case ARG_UUID:
339                         r = sd_id128_from_string(optarg, &arg_uuid);
340                         if (r < 0) {
341                                 log_error("Invalid UUID: %s", optarg);
342                                 return r;
343                         }
344                         break;
345
346                 case 'S':
347                         arg_slice = optarg;
348                         break;
349
350                 case 'M':
351                         if (isempty(optarg)) {
352                                 free(arg_machine);
353                                 arg_machine = NULL;
354                         } else {
355
356                                 if (!hostname_is_valid(optarg)) {
357                                         log_error("Invalid machine name: %s", optarg);
358                                         return -EINVAL;
359                                 }
360
361                                 free(arg_machine);
362                                 arg_machine = strdup(optarg);
363                                 if (!arg_machine)
364                                         return log_oom();
365
366                                 break;
367                         }
368
369                 case 'Z':
370                         arg_selinux_context = optarg;
371                         break;
372
373                 case 'L':
374                         arg_selinux_apifs_context = optarg;
375                         break;
376
377                 case ARG_READ_ONLY:
378                         arg_read_only = true;
379                         break;
380
381                 case ARG_CAPABILITY:
382                 case ARG_DROP_CAPABILITY: {
383                         char *state, *word;
384                         size_t length;
385
386                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
387                                 _cleanup_free_ char *t;
388                                 cap_value_t cap;
389
390                                 t = strndup(word, length);
391                                 if (!t)
392                                         return log_oom();
393
394                                 if (streq(t, "all")) {
395                                         if (c == ARG_CAPABILITY)
396                                                 plus = (uint64_t) -1;
397                                         else
398                                                 minus = (uint64_t) -1;
399                                 } else {
400                                         if (cap_from_name(t, &cap) < 0) {
401                                                 log_error("Failed to parse capability %s.", t);
402                                                 return -EINVAL;
403                                         }
404
405                                         if (c == ARG_CAPABILITY)
406                                                 plus |= 1ULL << (uint64_t) cap;
407                                         else
408                                                 minus |= 1ULL << (uint64_t) cap;
409                                 }
410                         }
411
412                         break;
413                 }
414
415                 case 'j':
416                         arg_link_journal = LINK_GUEST;
417                         break;
418
419                 case ARG_LINK_JOURNAL:
420                         if (streq(optarg, "auto"))
421                                 arg_link_journal = LINK_AUTO;
422                         else if (streq(optarg, "no"))
423                                 arg_link_journal = LINK_NO;
424                         else if (streq(optarg, "guest"))
425                                 arg_link_journal = LINK_GUEST;
426                         else if (streq(optarg, "host"))
427                                 arg_link_journal = LINK_HOST;
428                         else {
429                                 log_error("Failed to parse link journal mode %s", optarg);
430                                 return -EINVAL;
431                         }
432
433                         break;
434
435                 case ARG_BIND:
436                 case ARG_BIND_RO: {
437                         _cleanup_free_ char *a = NULL, *b = NULL;
438                         char *e;
439                         char ***x;
440
441                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
442
443                         e = strchr(optarg, ':');
444                         if (e) {
445                                 a = strndup(optarg, e - optarg);
446                                 b = strdup(e + 1);
447                         } else {
448                                 a = strdup(optarg);
449                                 b = strdup(optarg);
450                         }
451
452                         if (!a || !b)
453                                 return log_oom();
454
455                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
456                                 log_error("Invalid bind mount specification: %s", optarg);
457                                 return -EINVAL;
458                         }
459
460                         r = strv_extend(x, a);
461                         if (r < 0)
462                                 return log_oom();
463
464                         r = strv_extend(x, b);
465                         if (r < 0)
466                                 return log_oom();
467
468                         break;
469                 }
470
471                 case ARG_SETENV: {
472                         char **n;
473
474                         if (!env_assignment_is_valid(optarg)) {
475                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
476                                 return -EINVAL;
477                         }
478
479                         n = strv_env_set(arg_setenv, optarg);
480                         if (!n)
481                                 return log_oom();
482
483                         strv_free(arg_setenv);
484                         arg_setenv = n;
485                         break;
486                 }
487
488                 case 'q':
489                         arg_quiet = true;
490                         break;
491
492                 case ARG_SHARE_SYSTEM:
493                         arg_share_system = true;
494                         break;
495
496                 case ARG_REGISTER:
497                         r = parse_boolean(optarg);
498                         if (r < 0) {
499                                 log_error("Failed to parse --register= argument: %s", optarg);
500                                 return r;
501                         }
502
503                         arg_register = r;
504                         break;
505
506                 case ARG_KEEP_UNIT:
507                         arg_keep_unit = true;
508                         break;
509
510                 case ARG_PERSONALITY:
511
512                         arg_personality = personality_from_string(optarg);
513                         if (arg_personality == 0xffffffffLU) {
514                                 log_error("Unknown or unsupported personality '%s'.", optarg);
515                                 return -EINVAL;
516                         }
517
518                         break;
519
520                 case '?':
521                         return -EINVAL;
522
523                 default:
524                         assert_not_reached("Unhandled option");
525                 }
526         }
527
528         if (arg_share_system)
529                 arg_register = false;
530
531         if (arg_boot && arg_share_system) {
532                 log_error("--boot and --share-system may not be combined.");
533                 return -EINVAL;
534         }
535
536         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
537                 log_error("--keep-unit may not be used when invoked from a user session.");
538                 return -EINVAL;
539         }
540
541         if (arg_directory && arg_image) {
542                 log_error("--directory= and --image= may not be combined.");
543                 return -EINVAL;
544         }
545
546         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
547
548         return 1;
549 }
550
551 static int mount_all(const char *dest) {
552
553         typedef struct MountPoint {
554                 const char *what;
555                 const char *where;
556                 const char *type;
557                 const char *options;
558                 unsigned long flags;
559                 bool fatal;
560         } MountPoint;
561
562         static const MountPoint mount_table[] = {
563                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
564                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
565                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
566                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
567                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
568                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
569                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
570                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
571 #ifdef HAVE_SELINUX
572                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
573                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
574 #endif
575         };
576
577         unsigned k;
578         int r = 0;
579
580         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
581                 _cleanup_free_ char *where = NULL;
582 #ifdef HAVE_SELINUX
583                 _cleanup_free_ char *options = NULL;
584 #endif
585                 const char *o;
586                 int t;
587
588                 where = strjoin(dest, "/", mount_table[k].where, NULL);
589                 if (!where)
590                         return log_oom();
591
592                 t = path_is_mount_point(where, true);
593                 if (t < 0) {
594                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
595
596                         if (r == 0)
597                                 r = t;
598
599                         continue;
600                 }
601
602                 /* Skip this entry if it is not a remount. */
603                 if (mount_table[k].what && t > 0)
604                         continue;
605
606                 mkdir_p(where, 0755);
607
608 #ifdef HAVE_SELINUX
609                 if (arg_selinux_apifs_context &&
610                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
611                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
612                         if (!options)
613                                 return log_oom();
614
615                         o = options;
616                 } else
617 #endif
618                         o = mount_table[k].options;
619
620
621                 if (mount(mount_table[k].what,
622                           where,
623                           mount_table[k].type,
624                           mount_table[k].flags,
625                           o) < 0 &&
626                     mount_table[k].fatal) {
627
628                         log_error("mount(%s) failed: %m", where);
629
630                         if (r == 0)
631                                 r = -errno;
632                 }
633         }
634
635         return r;
636 }
637
638 static int mount_binds(const char *dest, char **l, bool ro) {
639         char **x, **y;
640
641         STRV_FOREACH_PAIR(x, y, l) {
642                 char *where;
643                 struct stat source_st, dest_st;
644                 int r;
645
646                 if (stat(*x, &source_st) < 0) {
647                         log_error("Failed to stat %s: %m", *x);
648                         return -errno;
649                 }
650
651                 where = strappenda(dest, *y);
652                 r = stat(where, &dest_st);
653                 if (r == 0) {
654                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
655                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
656                                                 *x, where);
657                                 return -EINVAL;
658                         }
659                 } else if (errno == ENOENT) {
660                         r = mkdir_parents_label(where, 0755);
661                         if (r < 0) {
662                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
663                                 return r;
664                         }
665                 } else {
666                         log_error("Failed to bind mount %s: %m", *x);
667                         return -errno;
668                 }
669                 /* Create the mount point, but be conservative -- refuse to create block
670                 * and char devices. */
671                 if (S_ISDIR(source_st.st_mode))
672                         mkdir_label(where, 0755);
673                 else if (S_ISFIFO(source_st.st_mode))
674                         mkfifo(where, 0644);
675                 else if (S_ISSOCK(source_st.st_mode))
676                         mknod(where, 0644 | S_IFSOCK, 0);
677                 else if (S_ISREG(source_st.st_mode))
678                         touch(where);
679                 else {
680                         log_error("Refusing to create mountpoint for file: %s", *x);
681                         return -ENOTSUP;
682                 }
683
684                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
685                         log_error("mount(%s) failed: %m", where);
686                         return -errno;
687                 }
688
689                 if (ro) {
690                         r = bind_remount_recursive(where, true);
691                         if (r < 0) {
692                                 log_error("Read-Only bind mount failed: %s", strerror(-r));
693                                 return r;
694                         }
695                 }
696         }
697
698         return 0;
699 }
700
701 static int setup_timezone(const char *dest) {
702         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
703         char *z, *y;
704         int r;
705
706         assert(dest);
707
708         /* Fix the timezone, if possible */
709         r = readlink_malloc("/etc/localtime", &p);
710         if (r < 0) {
711                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
712                 return 0;
713         }
714
715         z = path_startswith(p, "../usr/share/zoneinfo/");
716         if (!z)
717                 z = path_startswith(p, "/usr/share/zoneinfo/");
718         if (!z) {
719                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
720                 return 0;
721         }
722
723         where = strappend(dest, "/etc/localtime");
724         if (!where)
725                 return log_oom();
726
727         r = readlink_malloc(where, &q);
728         if (r >= 0) {
729                 y = path_startswith(q, "../usr/share/zoneinfo/");
730                 if (!y)
731                         y = path_startswith(q, "/usr/share/zoneinfo/");
732
733
734                 /* Already pointing to the right place? Then do nothing .. */
735                 if (y && streq(y, z))
736                         return 0;
737         }
738
739         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
740         if (!check)
741                 return log_oom();
742
743         if (access(check, F_OK) < 0) {
744                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
745                 return 0;
746         }
747
748         what = strappend("../usr/share/zoneinfo/", z);
749         if (!what)
750                 return log_oom();
751
752         unlink(where);
753         if (symlink(what, where) < 0) {
754                 log_error("Failed to correct timezone of container: %m");
755                 return 0;
756         }
757
758         return 0;
759 }
760
761 static int setup_resolv_conf(const char *dest) {
762         char _cleanup_free_ *where = NULL;
763
764         assert(dest);
765
766         if (arg_private_network)
767                 return 0;
768
769         /* Fix resolv.conf, if possible */
770         where = strappend(dest, "/etc/resolv.conf");
771         if (!where)
772                 return log_oom();
773
774         /* We don't really care for the results of this really. If it
775          * fails, it fails, but meh... */
776         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
777
778         return 0;
779 }
780
781 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
782
783         snprintf(s, 37,
784                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
785                  SD_ID128_FORMAT_VAL(id));
786
787         return s;
788 }
789
790 static int setup_boot_id(const char *dest) {
791         _cleanup_free_ char *from = NULL, *to = NULL;
792         sd_id128_t rnd = {};
793         char as_uuid[37];
794         int r;
795
796         assert(dest);
797
798         if (arg_share_system)
799                 return 0;
800
801         /* Generate a new randomized boot ID, so that each boot-up of
802          * the container gets a new one */
803
804         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
805         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
806         if (!from || !to)
807                 return log_oom();
808
809         r = sd_id128_randomize(&rnd);
810         if (r < 0) {
811                 log_error("Failed to generate random boot id: %s", strerror(-r));
812                 return r;
813         }
814
815         id128_format_as_uuid(rnd, as_uuid);
816
817         r = write_string_file(from, as_uuid);
818         if (r < 0) {
819                 log_error("Failed to write boot id: %s", strerror(-r));
820                 return r;
821         }
822
823         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
824                 log_error("Failed to bind mount boot id: %m");
825                 r = -errno;
826         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
827                 log_warning("Failed to make boot id read-only: %m");
828
829         unlink(from);
830         return r;
831 }
832
833 static int copy_devnodes(const char *dest) {
834
835         static const char devnodes[] =
836                 "null\0"
837                 "zero\0"
838                 "full\0"
839                 "random\0"
840                 "urandom\0"
841                 "tty\0";
842
843         const char *d;
844         int r = 0;
845         _cleanup_umask_ mode_t u;
846
847         assert(dest);
848
849         u = umask(0000);
850
851         NULSTR_FOREACH(d, devnodes) {
852                 _cleanup_free_ char *from = NULL, *to = NULL;
853                 struct stat st;
854
855                 from = strappend("/dev/", d);
856                 to = strjoin(dest, "/dev/", d, NULL);
857                 if (!from || !to)
858                         return log_oom();
859
860                 if (stat(from, &st) < 0) {
861
862                         if (errno != ENOENT) {
863                                 log_error("Failed to stat %s: %m", from);
864                                 return -errno;
865                         }
866
867                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
868
869                         log_error("%s is not a char or block device, cannot copy", from);
870                         return -EIO;
871
872                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
873
874                         log_error("mknod(%s) failed: %m", dest);
875                         return  -errno;
876                 }
877         }
878
879         return r;
880 }
881
882 static int setup_ptmx(const char *dest) {
883         _cleanup_free_ char *p = NULL;
884
885         p = strappend(dest, "/dev/ptmx");
886         if (!p)
887                 return log_oom();
888
889         if (symlink("pts/ptmx", p) < 0) {
890                 log_error("Failed to create /dev/ptmx symlink: %m");
891                 return -errno;
892         }
893
894         return 0;
895 }
896
897 static int setup_dev_console(const char *dest, const char *console) {
898         _cleanup_umask_ mode_t u;
899         const char *to;
900         struct stat st;
901         int r;
902
903         assert(dest);
904         assert(console);
905
906         u = umask(0000);
907
908         if (stat("/dev/null", &st) < 0) {
909                 log_error("Failed to stat /dev/null: %m");
910                 return -errno;
911         }
912
913         r = chmod_and_chown(console, 0600, 0, 0);
914         if (r < 0) {
915                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
916                 return r;
917         }
918
919         /* We need to bind mount the right tty to /dev/console since
920          * ptys can only exist on pts file systems. To have something
921          * to bind mount things on we create a device node first, and
922          * use /dev/null for that since we the cgroups device policy
923          * allows us to create that freely, while we cannot create
924          * /dev/console. (Note that the major minor doesn't actually
925          * matter here, since we mount it over anyway). */
926
927         to = strappenda(dest, "/dev/console");
928         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
929                 log_error("mknod() for /dev/console failed: %m");
930                 return -errno;
931         }
932
933         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
934                 log_error("Bind mount for /dev/console failed: %m");
935                 return -errno;
936         }
937
938         return 0;
939 }
940
941 static int setup_kmsg(const char *dest, int kmsg_socket) {
942         _cleanup_free_ char *from = NULL, *to = NULL;
943         int r, fd, k;
944         _cleanup_umask_ mode_t u;
945         union {
946                 struct cmsghdr cmsghdr;
947                 uint8_t buf[CMSG_SPACE(sizeof(int))];
948         } control = {};
949         struct msghdr mh = {
950                 .msg_control = &control,
951                 .msg_controllen = sizeof(control),
952         };
953         struct cmsghdr *cmsg;
954
955         assert(dest);
956         assert(kmsg_socket >= 0);
957
958         u = umask(0000);
959
960         /* We create the kmsg FIFO as /dev/kmsg, but immediately
961          * delete it after bind mounting it to /proc/kmsg. While FIFOs
962          * on the reading side behave very similar to /proc/kmsg,
963          * their writing side behaves differently from /dev/kmsg in
964          * that writing blocks when nothing is reading. In order to
965          * avoid any problems with containers deadlocking due to this
966          * we simply make /dev/kmsg unavailable to the container. */
967         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
968             asprintf(&to, "%s/proc/kmsg", dest) < 0)
969                 return log_oom();
970
971         if (mkfifo(from, 0600) < 0) {
972                 log_error("mkfifo() for /dev/kmsg failed: %m");
973                 return -errno;
974         }
975
976         r = chmod_and_chown(from, 0600, 0, 0);
977         if (r < 0) {
978                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
979                 return r;
980         }
981
982         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
983                 log_error("Bind mount for /proc/kmsg failed: %m");
984                 return -errno;
985         }
986
987         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
988         if (fd < 0) {
989                 log_error("Failed to open fifo: %m");
990                 return -errno;
991         }
992
993         cmsg = CMSG_FIRSTHDR(&mh);
994         cmsg->cmsg_level = SOL_SOCKET;
995         cmsg->cmsg_type = SCM_RIGHTS;
996         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
997         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
998
999         mh.msg_controllen = cmsg->cmsg_len;
1000
1001         /* Store away the fd in the socket, so that it stays open as
1002          * long as we run the child */
1003         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1004         safe_close(fd);
1005
1006         if (k < 0) {
1007                 log_error("Failed to send FIFO fd: %m");
1008                 return -errno;
1009         }
1010
1011         /* And now make the FIFO unavailable as /dev/kmsg... */
1012         unlink(from);
1013         return 0;
1014 }
1015
1016 static int setup_hostname(void) {
1017
1018         if (arg_share_system)
1019                 return 0;
1020
1021         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1022                 return -errno;
1023
1024         return 0;
1025 }
1026
1027 static int setup_journal(const char *directory) {
1028         sd_id128_t machine_id, this_id;
1029         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1030         char *id;
1031         int r;
1032
1033         p = strappend(directory, "/etc/machine-id");
1034         if (!p)
1035                 return log_oom();
1036
1037         r = read_one_line_file(p, &b);
1038         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1039                 return 0;
1040         else if (r < 0) {
1041                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1042                 return r;
1043         }
1044
1045         id = strstrip(b);
1046         if (isempty(id) && arg_link_journal == LINK_AUTO)
1047                 return 0;
1048
1049         /* Verify validity */
1050         r = sd_id128_from_string(id, &machine_id);
1051         if (r < 0) {
1052                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1053                 return r;
1054         }
1055
1056         r = sd_id128_get_machine(&this_id);
1057         if (r < 0) {
1058                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1059                 return r;
1060         }
1061
1062         if (sd_id128_equal(machine_id, this_id)) {
1063                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1064                          "Host and machine ids are equal (%s): refusing to link journals", id);
1065                 if (arg_link_journal == LINK_AUTO)
1066                         return 0;
1067                 return
1068                         -EEXIST;
1069         }
1070
1071         if (arg_link_journal == LINK_NO)
1072                 return 0;
1073
1074         free(p);
1075         p = strappend("/var/log/journal/", id);
1076         q = strjoin(directory, "/var/log/journal/", id, NULL);
1077         if (!p || !q)
1078                 return log_oom();
1079
1080         if (path_is_mount_point(p, false) > 0) {
1081                 if (arg_link_journal != LINK_AUTO) {
1082                         log_error("%s: already a mount point, refusing to use for journal", p);
1083                         return -EEXIST;
1084                 }
1085
1086                 return 0;
1087         }
1088
1089         if (path_is_mount_point(q, false) > 0) {
1090                 if (arg_link_journal != LINK_AUTO) {
1091                         log_error("%s: already a mount point, refusing to use for journal", q);
1092                         return -EEXIST;
1093                 }
1094
1095                 return 0;
1096         }
1097
1098         r = readlink_and_make_absolute(p, &d);
1099         if (r >= 0) {
1100                 if ((arg_link_journal == LINK_GUEST ||
1101                      arg_link_journal == LINK_AUTO) &&
1102                     path_equal(d, q)) {
1103
1104                         r = mkdir_p(q, 0755);
1105                         if (r < 0)
1106                                 log_warning("failed to create directory %s: %m", q);
1107                         return 0;
1108                 }
1109
1110                 if (unlink(p) < 0) {
1111                         log_error("Failed to remove symlink %s: %m", p);
1112                         return -errno;
1113                 }
1114         } else if (r == -EINVAL) {
1115
1116                 if (arg_link_journal == LINK_GUEST &&
1117                     rmdir(p) < 0) {
1118
1119                         if (errno == ENOTDIR) {
1120                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1121                                 return r;
1122                         } else {
1123                                 log_error("Failed to remove %s: %m", p);
1124                                 return -errno;
1125                         }
1126                 }
1127         } else if (r != -ENOENT) {
1128                 log_error("readlink(%s) failed: %m", p);
1129                 return r;
1130         }
1131
1132         if (arg_link_journal == LINK_GUEST) {
1133
1134                 if (symlink(q, p) < 0) {
1135                         log_error("Failed to symlink %s to %s: %m", q, p);
1136                         return -errno;
1137                 }
1138
1139                 r = mkdir_p(q, 0755);
1140                 if (r < 0)
1141                         log_warning("failed to create directory %s: %m", q);
1142                 return 0;
1143         }
1144
1145         if (arg_link_journal == LINK_HOST) {
1146                 r = mkdir_p(p, 0755);
1147                 if (r < 0) {
1148                         log_error("Failed to create %s: %m", p);
1149                         return r;
1150                 }
1151
1152         } else if (access(p, F_OK) < 0)
1153                 return 0;
1154
1155         if (dir_is_empty(q) == 0)
1156                 log_warning("%s is not empty, proceeding anyway.", q);
1157
1158         r = mkdir_p(q, 0755);
1159         if (r < 0) {
1160                 log_error("Failed to create %s: %m", q);
1161                 return r;
1162         }
1163
1164         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1165                 log_error("Failed to bind mount journal from host into guest: %m");
1166                 return -errno;
1167         }
1168
1169         return 0;
1170 }
1171
/* Bind mounts the kdbus domain directory path to <dest>/dev/kdbus,
 * creating the mount point first. A NULL path means there is nothing
 * to do. Returns 0 on success, a negative errno on failure. */
static int setup_kdbus(const char *dest, const char *path) {
        const char *mount_point;

        if (path == NULL)
                return 0;

        mount_point = strappenda(dest, "/dev/kdbus");

        if (mkdir(mount_point, 0755) < 0) {
                log_error("Failed to create kdbus path: %m");
                return -errno;
        }

        if (mount(path, mount_point, "bind", MS_BIND, NULL) < 0) {
                log_error("Failed to mount kdbus domain path: %m");
                return -errno;
        }

        return 0;
}
1191
1192 static int drop_capabilities(void) {
1193         return capability_bounding_set_drop(~arg_retain, false);
1194 }
1195
1196 static int register_machine(pid_t pid) {
1197         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1198         _cleanup_bus_unref_ sd_bus *bus = NULL;
1199         int r;
1200
1201         if (!arg_register)
1202                 return 0;
1203
1204         r = sd_bus_default_system(&bus);
1205         if (r < 0) {
1206                 log_error("Failed to open system bus: %s", strerror(-r));
1207                 return r;
1208         }
1209
1210         if (arg_keep_unit) {
1211                 r = sd_bus_call_method(
1212                                 bus,
1213                                 "org.freedesktop.machine1",
1214                                 "/org/freedesktop/machine1",
1215                                 "org.freedesktop.machine1.Manager",
1216                                 "RegisterMachine",
1217                                 &error,
1218                                 NULL,
1219                                 "sayssus",
1220                                 arg_machine,
1221                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1222                                 "nspawn",
1223                                 "container",
1224                                 (uint32_t) pid,
1225                                 strempty(arg_directory));
1226         } else {
1227                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1228
1229                 r = sd_bus_message_new_method_call(
1230                                 bus,
1231                                 &m,
1232                                 "org.freedesktop.machine1",
1233                                 "/org/freedesktop/machine1",
1234                                 "org.freedesktop.machine1.Manager",
1235                                 "CreateMachine");
1236                 if (r < 0) {
1237                         log_error("Failed to create message: %s", strerror(-r));
1238                         return r;
1239                 }
1240
1241                 r = sd_bus_message_append(
1242                                 m,
1243                                 "sayssus",
1244                                 arg_machine,
1245                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1246                                 "nspawn",
1247                                 "container",
1248                                 (uint32_t) pid,
1249                                 strempty(arg_directory));
1250                 if (r < 0) {
1251                         log_error("Failed to append message arguments: %s", strerror(-r));
1252                         return r;
1253                 }
1254
1255                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1256                 if (r < 0) {
1257                         log_error("Failed to open container: %s", strerror(-r));
1258                         return r;
1259                 }
1260
1261                 if (!isempty(arg_slice)) {
1262                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1263                         if (r < 0) {
1264                                 log_error("Failed to append slice: %s", strerror(-r));
1265                                 return r;
1266                         }
1267                 }
1268
1269                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1270                 if (r < 0) {
1271                         log_error("Failed to add device policy: %s", strerror(-r));
1272                         return r;
1273                 }
1274
1275                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1276                                           /* Allow the container to
1277                                            * access and create the API
1278                                            * device nodes, so that
1279                                            * PrivateDevices= in the
1280                                            * container can work
1281                                            * fine */
1282                                           "/dev/null", "rwm",
1283                                           "/dev/zero", "rwm",
1284                                           "/dev/full", "rwm",
1285                                           "/dev/random", "rwm",
1286                                           "/dev/urandom", "rwm",
1287                                           "/dev/tty", "rwm",
1288                                           /* Allow the container
1289                                            * access to ptys. However,
1290                                            * do not permit the
1291                                            * container to ever create
1292                                            * these device nodes. */
1293                                           "/dev/pts/ptmx", "rw",
1294                                           "char-pts", "rw",
1295                                           /* Allow the container
1296                                            * access to all kdbus
1297                                            * devices. Again, the
1298                                            * container cannot create
1299                                            * these nodes, only use
1300                                            * them. We use a pretty
1301                                            * open match here, so that
1302                                            * the kernel API can still
1303                                            * change. */
1304                                           "char-kdbus", "rw",
1305                                           "char-kdbus/*", "rw");
1306                 if (r < 0) {
1307                         log_error("Failed to add device whitelist: %s", strerror(-r));
1308                         return r;
1309                 }
1310
1311                 r = sd_bus_message_close_container(m);
1312                 if (r < 0) {
1313                         log_error("Failed to close container: %s", strerror(-r));
1314                         return r;
1315                 }
1316
1317                 r = sd_bus_call(bus, m, 0, &error, NULL);
1318         }
1319
1320         if (r < 0) {
1321                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1322                 return r;
1323         }
1324
1325         return 0;
1326 }
1327
1328 static int terminate_machine(pid_t pid) {
1329         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1330         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1331         _cleanup_bus_unref_ sd_bus *bus = NULL;
1332         const char *path;
1333         int r;
1334
1335         if (!arg_register)
1336                 return 0;
1337
1338         r = sd_bus_default_system(&bus);
1339         if (r < 0) {
1340                 log_error("Failed to open system bus: %s", strerror(-r));
1341                 return r;
1342         }
1343
1344         r = sd_bus_call_method(
1345                         bus,
1346                         "org.freedesktop.machine1",
1347                         "/org/freedesktop/machine1",
1348                         "org.freedesktop.machine1.Manager",
1349                         "GetMachineByPID",
1350                         &error,
1351                         &reply,
1352                         "u",
1353                         (uint32_t) pid);
1354         if (r < 0) {
1355                 /* Note that the machine might already have been
1356                  * cleaned up automatically, hence don't consider it a
1357                  * failure if we cannot get the machine object. */
1358                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1359                 return 0;
1360         }
1361
1362         r = sd_bus_message_read(reply, "o", &path);
1363         if (r < 0)
1364                 return bus_log_parse_error(r);
1365
1366         r = sd_bus_call_method(
1367                         bus,
1368                         "org.freedesktop.machine1",
1369                         path,
1370                         "org.freedesktop.machine1.Machine",
1371                         "Terminate",
1372                         &error,
1373                         NULL,
1374                         NULL);
1375         if (r < 0) {
1376                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1377                 return 0;
1378         }
1379
1380         return 0;
1381 }
1382
1383 static int reset_audit_loginuid(void) {
1384         _cleanup_free_ char *p = NULL;
1385         int r;
1386
1387         if (arg_share_system)
1388                 return 0;
1389
1390         r = read_one_line_file("/proc/self/loginuid", &p);
1391         if (r == -ENOENT)
1392                 return 0;
1393         if (r < 0) {
1394                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1395                 return r;
1396         }
1397
1398         /* Already reset? */
1399         if (streq(p, "4294967295"))
1400                 return 0;
1401
1402         r = write_string_file("/proc/self/loginuid", "4294967295");
1403         if (r < 0) {
1404                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1405                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1406                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1407                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1408                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1409
1410                 sleep(5);
1411         }
1412
1413         return 0;
1414 }
1415
1416 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1417
1418 static int get_mac(struct ether_addr *mac) {
1419         int r;
1420
1421         uint8_t result[8];
1422         size_t l, sz;
1423         uint8_t *v;
1424
1425         l = strlen(arg_machine);
1426         sz = sizeof(sd_id128_t) + l;
1427         v = alloca(sz);
1428
1429         /* fetch some persistent data unique to the host */
1430         r = sd_id128_get_machine((sd_id128_t*) v);
1431         if (r < 0)
1432                 return r;
1433
1434         /* combine with some data unique (on this host) to this
1435          * container instance */
1436         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1437
1438         /* Let's hash the host machine ID plus the container name. We
1439          * use a fixed, but originally randomly created hash key here. */
1440         siphash24(result, v, sz, HASH_KEY.bytes);
1441
1442         assert_cc(ETH_ALEN <= sizeof(result));
1443         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1444
1445         /* see eth_random_addr in the kernel */
1446         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1447         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1448
1449         return 0;
1450 }
1451
1452 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1453         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1454         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1455         struct ether_addr mac;
1456         int r;
1457
1458         if (!arg_private_network)
1459                 return 0;
1460
1461         if (!arg_network_veth)
1462                 return 0;
1463
1464         /* Use two different interface name prefixes depending whether
1465          * we are in bridge mode or not. */
1466         if (arg_network_bridge)
1467                 memcpy(iface_name, "vb-", 3);
1468         else
1469                 memcpy(iface_name, "ve-", 3);
1470         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1471
1472         r = get_mac(&mac);
1473         if (r < 0) {
1474                 log_error("Failed to generate predictable MAC address for host0");
1475                 return r;
1476         }
1477
1478         r = sd_rtnl_open(&rtnl, 0);
1479         if (r < 0) {
1480                 log_error("Failed to connect to netlink: %s", strerror(-r));
1481                 return r;
1482         }
1483
1484         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1485         if (r < 0) {
1486                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1487                 return r;
1488         }
1489
1490         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1491         if (r < 0) {
1492                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1493                 return r;
1494         }
1495
1496         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1497         if (r < 0) {
1498                 log_error("Failed to open netlink container: %s", strerror(-r));
1499                 return r;
1500         }
1501
1502         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1503         if (r < 0) {
1504                 log_error("Failed to open netlink container: %s", strerror(-r));
1505                 return r;
1506         }
1507
1508         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1509         if (r < 0) {
1510                 log_error("Failed to open netlink container: %s", strerror(-r));
1511                 return r;
1512         }
1513
1514         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1515         if (r < 0) {
1516                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1517                 return r;
1518         }
1519
1520         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1521         if (r < 0) {
1522                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1523                 return r;
1524         }
1525
1526         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1527         if (r < 0) {
1528                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1529                 return r;
1530         }
1531
1532         r = sd_rtnl_message_close_container(m);
1533         if (r < 0) {
1534                 log_error("Failed to close netlink container: %s", strerror(-r));
1535                 return r;
1536         }
1537
1538         r = sd_rtnl_message_close_container(m);
1539         if (r < 0) {
1540                 log_error("Failed to close netlink container: %s", strerror(-r));
1541                 return r;
1542         }
1543
1544         r = sd_rtnl_message_close_container(m);
1545         if (r < 0) {
1546                 log_error("Failed to close netlink container: %s", strerror(-r));
1547                 return r;
1548         }
1549
1550         r = sd_rtnl_call(rtnl, m, 0, NULL);
1551         if (r < 0) {
1552                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1553                 return r;
1554         }
1555
1556         return 0;
1557 }
1558
1559 static int setup_bridge(const char veth_name[]) {
1560         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1561         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1562         int r, bridge;
1563
1564         if (!arg_private_network)
1565                 return 0;
1566
1567         if (!arg_network_veth)
1568                 return 0;
1569
1570         if (!arg_network_bridge)
1571                 return 0;
1572
1573         bridge = (int) if_nametoindex(arg_network_bridge);
1574         if (bridge <= 0) {
1575                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1576                 return -errno;
1577         }
1578
1579         r = sd_rtnl_open(&rtnl, 0);
1580         if (r < 0) {
1581                 log_error("Failed to connect to netlink: %s", strerror(-r));
1582                 return r;
1583         }
1584
1585         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1586         if (r < 0) {
1587                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1588                 return r;
1589         }
1590
1591         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1592         if (r < 0) {
1593                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1594                 return r;
1595         }
1596
1597         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1598         if (r < 0) {
1599                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1600                 return r;
1601         }
1602
1603         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1604         if (r < 0) {
1605                 log_error("Failed to add netlink master field: %s", strerror(-r));
1606                 return r;
1607         }
1608
1609         r = sd_rtnl_call(rtnl, m, 0, NULL);
1610         if (r < 0) {
1611                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1612                 return r;
1613         }
1614
1615         return 0;
1616 }
1617
1618 static int parse_interface(struct udev *udev, const char *name) {
1619         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1620         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1621         int ifi;
1622
1623         ifi = (int) if_nametoindex(name);
1624         if (ifi <= 0) {
1625                 log_error("Failed to resolve interface %s: %m", name);
1626                 return -errno;
1627         }
1628
1629         sprintf(ifi_str, "n%i", ifi);
1630         d = udev_device_new_from_device_id(udev, ifi_str);
1631         if (!d) {
1632                 log_error("Failed to get udev device for interface %s: %m", name);
1633                 return -errno;
1634         }
1635
1636         if (udev_device_get_is_initialized(d) <= 0) {
1637                 log_error("Network interface %s is not initialized yet.", name);
1638                 return -EBUSY;
1639         }
1640
1641         return ifi;
1642 }
1643
1644 static int move_network_interfaces(pid_t pid) {
1645         _cleanup_udev_unref_ struct udev *udev = NULL;
1646         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1647         char **i;
1648         int r;
1649
1650         if (!arg_private_network)
1651                 return 0;
1652
1653         if (strv_isempty(arg_network_interfaces))
1654                 return 0;
1655
1656         r = sd_rtnl_open(&rtnl, 0);
1657         if (r < 0) {
1658                 log_error("Failed to connect to netlink: %s", strerror(-r));
1659                 return r;
1660         }
1661
1662         udev = udev_new();
1663         if (!udev) {
1664                 log_error("Failed to connect to udev.");
1665                 return -ENOMEM;
1666         }
1667
1668         STRV_FOREACH(i, arg_network_interfaces) {
1669                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1670                 int ifi;
1671
1672                 ifi = parse_interface(udev, *i);
1673                 if (ifi < 0)
1674                         return ifi;
1675
1676                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1677                 if (r < 0) {
1678                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1679                         return r;
1680                 }
1681
1682                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1683                 if (r < 0) {
1684                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1685                         return r;
1686                 }
1687
1688                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1689                 if (r < 0) {
1690                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1691                         return r;
1692                 }
1693         }
1694
1695         return 0;
1696 }
1697
1698 static int setup_macvlan(pid_t pid) {
1699         _cleanup_udev_unref_ struct udev *udev = NULL;
1700         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1701         char **i;
1702         int r;
1703
1704         if (!arg_private_network)
1705                 return 0;
1706
1707         if (strv_isempty(arg_network_macvlan))
1708                 return 0;
1709
1710         r = sd_rtnl_open(&rtnl, 0);
1711         if (r < 0) {
1712                 log_error("Failed to connect to netlink: %s", strerror(-r));
1713                 return r;
1714         }
1715
1716         udev = udev_new();
1717         if (!udev) {
1718                 log_error("Failed to connect to udev.");
1719                 return -ENOMEM;
1720         }
1721
1722         STRV_FOREACH(i, arg_network_macvlan) {
1723                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1724                 _cleanup_free_ char *n = NULL;
1725                 int ifi;
1726
1727                 ifi = parse_interface(udev, *i);
1728                 if (ifi < 0)
1729                         return ifi;
1730
1731                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1732                 if (r < 0) {
1733                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1734                         return r;
1735                 }
1736
1737                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1738                 if (r < 0) {
1739                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1740                         return r;
1741                 }
1742
1743                 n = strappend("mv-", *i);
1744                 if (!n)
1745                         return log_oom();
1746
1747                 strshorten(n, IFNAMSIZ-1);
1748
1749                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1750                 if (r < 0) {
1751                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1752                         return r;
1753                 }
1754
1755                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1756                 if (r < 0) {
1757                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1758                         return r;
1759                 }
1760
1761                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1762                 if (r < 0) {
1763                         log_error("Failed to open netlink container: %s", strerror(-r));
1764                         return r;
1765                 }
1766
1767                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1768                 if (r < 0) {
1769                         log_error("Failed to open netlink container: %s", strerror(-r));
1770                         return r;
1771                 }
1772
1773                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1774                 if (r < 0) {
1775                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1776                         return r;
1777                 }
1778
1779                 r = sd_rtnl_message_close_container(m);
1780                 if (r < 0) {
1781                         log_error("Failed to close netlink container: %s", strerror(-r));
1782                         return r;
1783                 }
1784
1785                 r = sd_rtnl_message_close_container(m);
1786                 if (r < 0) {
1787                         log_error("Failed to close netlink container: %s", strerror(-r));
1788                         return r;
1789                 }
1790
1791                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1792                 if (r < 0) {
1793                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1794                         return r;
1795                 }
1796         }
1797
1798         return 0;
1799 }
1800
#ifdef HAVE_SECCOMP
/* Populates the given seccomp context with the audit-socket rule and
 * loads it. The caller owns (and releases) the context. Returns the
 * result of the first failing step, or of seccomp_load(). */
static int audit_seccomp_populate_and_load(scmp_filter_ctx ctx) {
        int ret;

        ret = seccomp_add_secondary_archs(ctx);
        if (ret < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-ret));
                return ret;
        }

        /* socket(AF_NETLINK, *, NETLINK_AUDIT) -> EAFNOSUPPORT */
        ret = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (ret < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-ret));
                return ret;
        }

        ret = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (ret < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-ret));
                return ret;
        }

        ret = seccomp_load(ctx);
        if (ret < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-ret));

        return ret;
}
#endif

/* Installs a seccomp filter that turns off creation of audit sockets.
 *
 * Audit is broken in containers, and much of the userspace audit hookup
 * fails when run inside one. Making socket(AF_NETLINK, *, NETLINK_AUDIT)
 * fail with EAFNOSUPPORT is what audit userspace takes as the indication
 * that audit is disabled in the kernel.
 *
 * Without seccomp support compiled in this is a no-op returning 0.
 * Otherwise returns the seccomp result: negative on failure. */
static int audit_still_doesnt_work_in_containers(void) {

#ifdef HAVE_SECCOMP
        scmp_filter_ctx ctx;
        int ret;

        /* Default action: allow everything not matched by a rule */
        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        ret = audit_seccomp_populate_and_load(ctx);

        seccomp_release(ctx);
        return ret;
#else
        return 0;
#endif

}
1857
1858 static int setup_image(char **device_path, int *loop_nr) {
1859         struct loop_info64 info = {
1860                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1861         };
1862         _cleanup_close_ int fd = -1, control = -1, loop = -1;
1863         _cleanup_free_ char* loopdev = NULL;
1864         struct stat st;
1865         int r, nr;
1866
1867         assert(device_path);
1868         assert(loop_nr);
1869
1870         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1871         if (fd < 0) {
1872                 log_error("Failed to open %s: %m", arg_image);
1873                 return -errno;
1874         }
1875
1876         if (fstat(fd, &st) < 0) {
1877                 log_error("Failed to stat %s: %m", arg_image);
1878                 return -errno;
1879         }
1880
1881         if (S_ISBLK(st.st_mode)) {
1882                 char *p;
1883
1884                 p = strdup(arg_image);
1885                 if (!p)
1886                         return log_oom();
1887
1888                 *device_path = p;
1889
1890                 *loop_nr = -1;
1891
1892                 r = fd;
1893                 fd = -1;
1894
1895                 return r;
1896         }
1897
1898         if (!S_ISREG(st.st_mode)) {
1899                 log_error("%s is not a regular file or block device: %m", arg_image);
1900                 return -EINVAL;
1901         }
1902
1903         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1904         if (control < 0) {
1905                 log_error("Failed to open /dev/loop-control: %m");
1906                 return -errno;
1907         }
1908
1909         nr = ioctl(control, LOOP_CTL_GET_FREE);
1910         if (nr < 0) {
1911                 log_error("Failed to allocate loop device: %m");
1912                 return -errno;
1913         }
1914
1915         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1916                 return log_oom();
1917
1918         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1919         if (loop < 0) {
1920                 log_error("Failed to open loop device %s: %m", loopdev);
1921                 return -errno;
1922         }
1923
1924         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1925                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1926                 return -errno;
1927         }
1928
1929         if (arg_read_only)
1930                 info.lo_flags |= LO_FLAGS_READ_ONLY;
1931
1932         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1933                 log_error("Failed to set loopback settings on %s: %m", loopdev);
1934                 return -errno;
1935         }
1936
1937         *device_path = loopdev;
1938         loopdev = NULL;
1939
1940         *loop_nr = nr;
1941
1942         r = loop;
1943         loop = -1;
1944
1945         return r;
1946 }
1947
#ifdef HAVE_BLKID
/* Helper for dissect_image(): considers the partition `node` (number
 * `nr`, GPT attribute bits `flags`) as candidate for one partition role.
 *
 * If a candidate is already stored in *device and its number *device_nr
 * is not higher than `nr`, nothing changes. Otherwise the stored
 * candidate is replaced by a copy of `node`, and *device_nr and
 * *device_rw (true unless GPT_FLAG_READ_ONLY is set) are updated.
 *
 * Returns 0, or the result of log_oom() if the copy cannot be allocated. */
static int dissect_image_consider(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, unsigned long long flags) {

        if (*device && nr >= *device_nr)
                return 0;

        *device_nr = nr;
        *device_rw = !(flags & GPT_FLAG_READ_ONLY);

        free(*device);
        *device = strdup(node);
        if (!*device)
                return log_oom();

        return 0;
}
#endif

/* Identifies the root, /home and /srv partitions of a GPT disk image by
 * their partition type UUIDs.
 *
 * `fd` is an open file descriptor of the block device to inspect. The
 * partition table is read with libblkid; the partition device nodes are
 * found by enumerating the udev children of the block device. Partitions
 * flagged GPT_FLAG_NO_AUTO are ignored; if several partitions share a
 * type, the one with the lowest partition number is used. A native root
 * partition is preferred over a secondary one.
 *
 * On success returns 0 and stores newly allocated device node paths (to
 * be freed by the caller) in *root_device and, only if such partitions
 * exist, in *home_device and *srv_device, together with their matching
 * *_rw flags (false if the partition carries GPT_FLAG_READ_ONLY).
 * *secondary tells whether the secondary root partition was picked.
 * Returns a negative errno-style value on failure, -EINVAL in particular
 * if there is no GPT or no root partition, and -ENOTSUP if compiled
 * without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerate = NULL;
        _cleanup_udev_device_unref_ struct udev_device *disk = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist partitions;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        /* Step 1: read the partition table via libblkid */

        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(probe, fd, 0, 0);
        if (r != 0) {
                /* Failure without errno is treated as out-of-memory */
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(probe, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partitions = blkid_probe_get_partitions(probe);
        if (!partitions) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        /* Step 2: enumerate the udev children of the block device */

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        disk = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!disk)
                return log_oom();

        enumerate = udev_enumerate_new(udev);
        if (!enumerate)
                return log_oom();

        r = udev_enumerate_add_match_parent(enumerate, disk);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(enumerate);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        /* Step 3: match each child device against the partition table */

        first = udev_enumerate_get_list_entry(enumerate);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *part = NULL;
                const char *type_str, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition entry;
                dev_t devno;
                int nr;

                errno = 0;
                part = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!part) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number, and the whole disk itself */
                devno = udev_device_get_devnum(part);
                if (major(devno) == 0 || st.st_rdev == devno)
                        continue;

                node = udev_device_get_devnode(part);
                if (!node)
                        continue;

                entry = blkid_partlist_devno_to_partition(partitions, devno);
                if (!entry)
                        continue;

                /* Honour the "do not auto-mount" attribute */
                flags = blkid_partition_get_flags(entry);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(entry);
                if (nr < 0)
                        continue;

                type_str = blkid_partition_get_type_string(entry);
                if (!type_str)
                        continue;

                if (sd_id128_from_string(type_str, &type_id) < 0)
                        continue;

                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_image_consider(&home, &home_nr, &home_rw, node, nr, flags);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_image_consider(&srv, &srv_nr, &srv_rw, node, nr, flags);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_image_consider(&root, &root_nr, &root_rw, node, nr, flags);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_image_consider(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, flags);
#endif
                else
                        r = 0; /* partition type we are not interested in */

                if (r < 0)
                        return r;
        }

        /* Step 4: hand the results over to the caller */

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else {
                /* The check above guarantees a secondary root partition here */
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2197
/* Mounts the block device `what` below the directory `where`.
 *
 * If `directory` is non-NULL it is appended to `where` to form the mount
 * point, otherwise `where` itself is the mount point. The file system
 * type is probed with libblkid. The mount is made with MS_NODEV, and
 * read-only if `rw` is false or arg_read_only is set.
 *
 * Returns 0 on success. Returns -EINVAL if the file system type cannot
 * be determined, -ENOTSUP for LUKS volumes or when compiled without
 * blkid support, and another negative errno-style value on other
 * failures. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A globally read-only container overrides the per-partition setting */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 if
         * the result is ambivalent: in both cases no file system type can
         * be determined. -1 is a genuine probing error and is handled by
         * the generic branch below, like dissect_image() does. */
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2263
/* Mounts the dissected partitions of a disk image below `where`.
 *
 * The root partition is mounted on `where` itself, the home partition on
 * `where`/home and the server data partition on `where`/srv. Each device
 * argument may be NULL, in which case that mount is skipped. The *_rw
 * flags are passed through to mount_device().
 *
 * Returns 0 on success, or the negative errno-style value of the first
 * mount_device() call that failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the `where` parameter rather than reaching
         * for the global arg_directory. The root has to come first so
         * that /home and /srv land inside it. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2299
2300 static void loop_remove(int nr, int *image_fd) {
2301         _cleanup_close_ int control = -1;
2302
2303         if (nr < 0)
2304                 return;
2305
2306         if (image_fd && *image_fd >= 0) {
2307                 ioctl(*image_fd, LOOP_CLR_FD);
2308                 *image_fd = safe_close(*image_fd);
2309         }
2310
2311         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2312         if (control < 0)
2313                 return;
2314
2315         ioctl(control, LOOP_CTL_REMOVE, nr);
2316 }
2317
2318 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2319         int pipe_fds[2];
2320         pid_t pid;
2321
2322         assert(database);
2323         assert(key);
2324         assert(rpid);
2325
2326         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2327                 log_error("Failed to allocate pipe: %m");
2328                 return -errno;
2329         }
2330
2331         pid = fork();
2332         if (pid < 0) {
2333                 log_error("Failed to fork getent child: %m");
2334                 return -errno;
2335         } else if (pid == 0) {
2336                 int nullfd;
2337                 char *empty_env = NULL;
2338
2339                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2340                         _exit(EXIT_FAILURE);
2341
2342                 if (pipe_fds[0] > 2)
2343                         safe_close(pipe_fds[0]);
2344                 if (pipe_fds[1] > 2)
2345                         safe_close(pipe_fds[1]);
2346
2347                 nullfd = open("/dev/null", O_RDWR);
2348                 if (nullfd < 0)
2349                         _exit(EXIT_FAILURE);
2350
2351                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2352                         _exit(EXIT_FAILURE);
2353
2354                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2355                         _exit(EXIT_FAILURE);
2356
2357                 if (nullfd > 2)
2358                         safe_close(nullfd);
2359
2360                 reset_all_signal_handlers();
2361                 close_all_fds(NULL, 0);
2362
2363                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2364                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2365                 _exit(EXIT_FAILURE);
2366         }
2367
2368         pipe_fds[1] = safe_close(pipe_fds[1]);
2369
2370         *rpid = pid;
2371
2372         return pipe_fds[0];
2373 }
2374
2375 static int change_uid_gid(char **_home) {
2376         char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2377         _cleanup_free_ uid_t *uids = NULL;
2378         _cleanup_free_ char *home = NULL;
2379         _cleanup_fclose_ FILE *f = NULL;
2380         _cleanup_close_ int fd = -1;
2381         unsigned n_uids = 0;
2382         size_t sz = 0, l;
2383         uid_t uid;
2384         gid_t gid;
2385         pid_t pid;
2386         int r;
2387
2388         assert(_home);
2389
2390         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2391                 /* Reset everything fully to 0, just in case */
2392
2393                 if (setgroups(0, NULL) < 0) {
2394                         log_error("setgroups() failed: %m");
2395                         return -errno;
2396                 }
2397
2398                 if (setresgid(0, 0, 0) < 0) {
2399                         log_error("setregid() failed: %m");
2400                         return -errno;
2401                 }
2402
2403                 if (setresuid(0, 0, 0) < 0) {
2404                         log_error("setreuid() failed: %m");
2405                         return -errno;
2406                 }
2407
2408                 *_home = NULL;
2409                 return 0;
2410         }
2411
2412         /* First, get user credentials */
2413         fd = spawn_getent("passwd", arg_user, &pid);
2414         if (fd < 0)
2415                 return fd;
2416
2417         f = fdopen(fd, "r");
2418         if (!f)
2419                 return log_oom();
2420         fd = -1;
2421
2422         if (!fgets(line, sizeof(line), f)) {
2423
2424                 if (!ferror(f)) {
2425                         log_error("Failed to resolve user %s.", arg_user);
2426                         return -ESRCH;
2427                 }
2428
2429                 log_error("Failed to read from getent: %m");
2430                 return -errno;
2431         }
2432
2433         truncate_nl(line);
2434
2435         wait_for_terminate_and_warn("getent passwd", pid);
2436
2437         x = strchr(line, ':');
2438         if (!x) {
2439                 log_error("/etc/passwd entry has invalid user field.");
2440                 return -EIO;
2441         }
2442
2443         u = strchr(x+1, ':');
2444         if (!u) {
2445                 log_error("/etc/passwd entry has invalid password field.");
2446                 return -EIO;
2447         }
2448
2449         u++;
2450         g = strchr(u, ':');
2451         if (!g) {
2452                 log_error("/etc/passwd entry has invalid UID field.");
2453                 return -EIO;
2454         }
2455
2456         *g = 0;
2457         g++;
2458         x = strchr(g, ':');
2459         if (!x) {
2460                 log_error("/etc/passwd entry has invalid GID field.");
2461                 return -EIO;
2462         }
2463
2464         *x = 0;
2465         h = strchr(x+1, ':');
2466         if (!h) {
2467                 log_error("/etc/passwd entry has invalid GECOS field.");
2468                 return -EIO;
2469         }
2470
2471         h++;
2472         x = strchr(h, ':');
2473         if (!x) {
2474                 log_error("/etc/passwd entry has invalid home directory field.");
2475                 return -EIO;
2476         }
2477
2478         *x = 0;
2479
2480         r = parse_uid(u, &uid);
2481         if (r < 0) {
2482                 log_error("Failed to parse UID of user.");
2483                 return -EIO;
2484         }
2485
2486         r = parse_gid(g, &gid);
2487         if (r < 0) {
2488                 log_error("Failed to parse GID of user.");
2489                 return -EIO;
2490         }
2491
2492         home = strdup(h);
2493         if (!home)
2494                 return log_oom();
2495
2496         /* Second, get group memberships */
2497         fd = spawn_getent("initgroups", arg_user, &pid);
2498         if (fd < 0)
2499                 return fd;
2500
2501         fclose(f);
2502         f = fdopen(fd, "r");
2503         if (!f)
2504                 return log_oom();
2505         fd = -1;
2506
2507         if (!fgets(line, sizeof(line), f)) {
2508                 if (!ferror(f)) {
2509                         log_error("Failed to resolve user %s.", arg_user);
2510                         return -ESRCH;
2511                 }
2512
2513                 log_error("Failed to read from getent: %m");
2514                 return -errno;
2515         }
2516
2517         truncate_nl(line);
2518
2519         wait_for_terminate_and_warn("getent initgroups", pid);
2520
2521         /* Skip over the username and subsequent separator whitespace */
2522         x = line;
2523         x += strcspn(x, WHITESPACE);
2524         x += strspn(x, WHITESPACE);
2525
2526         FOREACH_WORD(w, l, x, state) {
2527                 char c[l+1];
2528
2529                 memcpy(c, w, l);
2530                 c[l] = 0;
2531
2532                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2533                         return log_oom();
2534
2535                 r = parse_uid(c, &uids[n_uids++]);
2536                 if (r < 0) {
2537                         log_error("Failed to parse group data from getent.");
2538                         return -EIO;
2539                 }
2540         }
2541
2542         r = mkdir_parents(home, 0775);
2543         if (r < 0) {
2544                 log_error("Failed to make home root directory: %s", strerror(-r));
2545                 return r;
2546         }
2547
2548         r = mkdir_safe(home, 0755, uid, gid);
2549         if (r < 0 && r != -EEXIST) {
2550                 log_error("Failed to make home directory: %s", strerror(-r));
2551                 return r;
2552         }
2553
2554         fchown(STDIN_FILENO, uid, gid);
2555         fchown(STDOUT_FILENO, uid, gid);
2556         fchown(STDERR_FILENO, uid, gid);
2557
2558         if (setgroups(n_uids, uids) < 0) {
2559                 log_error("Failed to set auxiliary groups: %m");
2560                 return -errno;
2561         }
2562
2563         if (setresgid(gid, gid, gid) < 0) {
2564                 log_error("setregid() failed: %m");
2565                 return -errno;
2566         }
2567
2568         if (setresuid(uid, uid, uid) < 0) {
2569                 log_error("setreuid() failed: %m");
2570                 return -errno;
2571         }
2572
2573         if (_home) {
2574                 *_home = home;
2575                 home = NULL;
2576         }
2577
2578         return 0;
2579 }
2580
2581 /*
2582  * Return 0 in case the container is being rebooted, has been shut
2583  * down or exited successfully. On failures a negative value is
2584  * returned.
2585  *
2586  * The status of the container "CONTAINER_TERMINATED" or
2587  * "CONTAINER_REBOOTED" will be saved in the container argument
2588  */
2589 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2590         int r;
2591         siginfo_t status;
2592
2593         r = wait_for_terminate(pid, &status);
2594         if (r < 0)
2595                 return r;
2596
2597         switch (status.si_code) {
2598         case CLD_EXITED:
2599                 r = status.si_status;
2600                 if (r == 0) {
2601                         if (!arg_quiet)
2602                                 log_debug("Container %s exited successfully.",
2603                                           arg_machine);
2604
2605                         *container = CONTAINER_TERMINATED;
2606                 } else {
2607                         log_error("Container %s failed with error code %i.",
2608                                   arg_machine, status.si_status);
2609                         r = -1;
2610                 }
2611                 break;
2612
2613         case CLD_KILLED:
2614                 if (status.si_status == SIGINT) {
2615                         if (!arg_quiet)
2616                                 log_info("Container %s has been shut down.",
2617                                          arg_machine);
2618
2619                         *container = CONTAINER_TERMINATED;
2620                         r = 0;
2621                         break;
2622                 } else if (status.si_status == SIGHUP) {
2623                         if (!arg_quiet)
2624                                 log_info("Container %s is being rebooted.",
2625                                          arg_machine);
2626
2627                         *container = CONTAINER_REBOOTED;
2628                         r = 0;
2629                         break;
2630                 }
2631                 /* CLD_KILLED fallthrough */
2632
2633         case CLD_DUMPED:
2634                 log_error("Container %s terminated by signal %s.",
2635                           arg_machine, signal_to_string(status.si_status));
2636                 r = -1;
2637                 break;
2638
2639         default:
2640                 log_error("Container %s failed due to unknown reason.",
2641                           arg_machine);
2642                 r = -1;
2643                 break;
2644         }
2645
2646         return r;
2647 }
2648
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
2650
2651 int main(int argc, char *argv[]) {
2652
2653         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2654         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2655         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2656         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2657         _cleanup_fdset_free_ FDSet *fds = NULL;
2658         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2659         const char *console = NULL;
2660         char veth_name[IFNAMSIZ];
2661         bool secondary = false;
2662         sigset_t mask, mask_chld;
2663         pid_t pid = 0;
2664
2665         log_parse_environment();
2666         log_open();
2667
2668         k = parse_argv(argc, argv);
2669         if (k < 0)
2670                 goto finish;
2671         else if (k == 0) {
2672                 r = EXIT_SUCCESS;
2673                 goto finish;
2674         }
2675
2676         if (!arg_image) {
2677                 if (arg_directory) {
2678                         char *p;
2679
2680                         p = path_make_absolute_cwd(arg_directory);
2681                         free(arg_directory);
2682                         arg_directory = p;
2683                 } else
2684                         arg_directory = get_current_dir_name();
2685
2686                 if (!arg_directory) {
2687                         log_error("Failed to determine path, please use -D.");
2688                         goto finish;
2689                 }
2690                 path_kill_slashes(arg_directory);
2691         }
2692
2693         if (!arg_machine) {
2694                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2695                 if (!arg_machine) {
2696                         log_oom();
2697                         goto finish;
2698                 }
2699
2700                 hostname_cleanup(arg_machine, false);
2701                 if (isempty(arg_machine)) {
2702                         log_error("Failed to determine machine name automatically, please use -M.");
2703                         goto finish;
2704                 }
2705         }
2706
2707         if (geteuid() != 0) {
2708                 log_error("Need to be root.");
2709                 goto finish;
2710         }
2711
2712         if (sd_booted() <= 0) {
2713                 log_error("Not running on a systemd system.");
2714                 goto finish;
2715         }
2716
2717         log_close();
2718         n_fd_passed = sd_listen_fds(false);
2719         if (n_fd_passed > 0) {
2720                 k = fdset_new_listen_fds(&fds, false);
2721                 if (k < 0) {
2722                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2723                         goto finish;
2724                 }
2725         }
2726         fdset_close_others(fds);
2727         log_open();
2728
2729         if (arg_directory) {
2730                 if (path_equal(arg_directory, "/")) {
2731                         log_error("Spawning container on root directory not supported.");
2732                         goto finish;
2733                 }
2734
2735                 if (arg_boot) {
2736                         if (path_is_os_tree(arg_directory) <= 0) {
2737                                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2738                                 goto finish;
2739                         }
2740                 } else {
2741                         const char *p;
2742
2743                         p = strappenda(arg_directory,
2744                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2745                         if (access(p, F_OK) < 0) {
2746                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2747                                 goto finish;
2748
2749                         }
2750                 }
2751         } else {
2752                 char template[] = "/tmp/nspawn-root-XXXXXX";
2753
2754                 if (!mkdtemp(template)) {
2755                         log_error("Failed to create temporary directory: %m");
2756                         r = -errno;
2757                         goto finish;
2758                 }
2759
2760                 arg_directory = strdup(template);
2761                 if (!arg_directory) {
2762                         r = log_oom();
2763                         goto finish;
2764                 }
2765
2766                 image_fd = setup_image(&device_path, &loop_nr);
2767                 if (image_fd < 0) {
2768                         r = image_fd;
2769                         goto finish;
2770                 }
2771
2772                 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2773                 if (r < 0)
2774                         goto finish;
2775         }
2776
2777         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2778         if (master < 0) {
2779                 log_error("Failed to acquire pseudo tty: %m");
2780                 goto finish;
2781         }
2782
2783         console = ptsname(master);
2784         if (!console) {
2785                 log_error("Failed to determine tty name: %m");
2786                 goto finish;
2787         }
2788
2789         if (!arg_quiet)
2790                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
2791
2792         if (unlockpt(master) < 0) {
2793                 log_error("Failed to unlock tty: %m");
2794                 goto finish;
2795         }
2796
2797         if (access("/dev/kdbus/control", F_OK) >= 0) {
2798
2799                 if (arg_share_system) {
2800                         kdbus_domain = strdup("/dev/kdbus");
2801                         if (!kdbus_domain) {
2802                                 log_oom();
2803                                 goto finish;
2804                         }
2805                 } else {
2806                         const char *ns;
2807
2808                         ns = strappenda("machine-", arg_machine);
2809                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2810                         if (r < 0)
2811                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2812                         else
2813                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2814                 }
2815         }
2816
2817         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2818                 log_error("Failed to create kmsg socket pair: %m");
2819                 goto finish;
2820         }
2821
2822         sd_notify(0, "READY=1");
2823
2824         assert_se(sigemptyset(&mask) == 0);
2825         assert_se(sigemptyset(&mask_chld) == 0);
2826         sigaddset(&mask_chld, SIGCHLD);
2827         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2828         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2829
2830         for (;;) {
2831                 ContainerStatus container_status;
2832                 int eventfds[2] = { -1, -1 };
2833                 struct sigaction sa = {
2834                         .sa_handler = nop_handler,
2835                         .sa_flags = SA_NOCLDSTOP,
2836                 };
2837
2838                 /* Child can be killed before execv(), so handle SIGCHLD
2839                  * in order to interrupt parent's blocking calls and
2840                  * give it a chance to call wait() and terminate. */
2841                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2842                 if (r < 0) {
2843                         log_error("Failed to change the signal mask: %m");
2844                         goto finish;
2845                 }
2846
2847                 r = sigaction(SIGCHLD, &sa, NULL);
2848                 if (r < 0) {
2849                         log_error("Failed to install SIGCHLD handler: %m");
2850                         goto finish;
2851                 }
2852
2853                 pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
2854                                          (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2855                                          (arg_private_network ? CLONE_NEWNET : 0), eventfds);
2856                 if (pid < 0) {
2857                         if (errno == EINVAL)
2858                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2859                         else
2860                                 log_error("clone() failed: %m");
2861
2862                         r = pid;
2863                         goto finish;
2864                 }
2865
2866                 if (pid == 0) {
2867                         /* child */
2868                         _cleanup_free_ char *home = NULL;
2869                         unsigned n_env = 2;
2870                         const char *envp[] = {
2871                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
2872                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2873                                 NULL, /* TERM */
2874                                 NULL, /* HOME */
2875                                 NULL, /* USER */
2876                                 NULL, /* LOGNAME */
2877                                 NULL, /* container_uuid */
2878                                 NULL, /* LISTEN_FDS */
2879                                 NULL, /* LISTEN_PID */
2880                                 NULL
2881                         };
2882                         char **env_use;
2883
2884                         envp[n_env] = strv_find_prefix(environ, "TERM=");
2885                         if (envp[n_env])
2886                                 n_env ++;
2887
2888                         master = safe_close(master);
2889
2890                         close_nointr(STDIN_FILENO);
2891                         close_nointr(STDOUT_FILENO);
2892                         close_nointr(STDERR_FILENO);
2893
2894                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2895
2896                         reset_all_signal_handlers();
2897
2898                         assert_se(sigemptyset(&mask) == 0);
2899                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2900
2901                         k = open_terminal(console, O_RDWR);
2902                         if (k != STDIN_FILENO) {
2903                                 if (k >= 0) {
2904                                         safe_close(k);
2905                                         k = -EINVAL;
2906                                 }
2907
2908                                 log_error("Failed to open console: %s", strerror(-k));
2909                                 goto child_fail;
2910                         }
2911
2912                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2913                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2914                                 log_error("Failed to duplicate console: %m");
2915                                 goto child_fail;
2916                         }
2917
2918                         if (setsid() < 0) {
2919                                 log_error("setsid() failed: %m");
2920                                 goto child_fail;
2921                         }
2922
2923                         if (reset_audit_loginuid() < 0)
2924                                 goto child_fail;
2925
2926                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2927                                 log_error("PR_SET_PDEATHSIG failed: %m");
2928                                 goto child_fail;
2929                         }
2930
2931                         /* Mark everything as slave, so that we still
2932                          * receive mounts from the real root, but don't
2933                          * propagate mounts to the real root. */
2934                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2935                                 log_error("MS_SLAVE|MS_REC failed: %m");
2936                                 goto child_fail;
2937                         }
2938
2939                         if (mount_devices(arg_directory,
2940                                           root_device, root_device_rw,
2941                                           home_device, home_device_rw,
2942                                           srv_device, srv_device_rw) < 0)
2943                                 goto child_fail;
2944
2945                         /* Turn directory into bind mount */
2946                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2947                                 log_error("Failed to make bind mount: %m");
2948                                 goto child_fail;
2949                         }
2950
2951                         if (arg_read_only) {
2952                                 k = bind_remount_recursive(arg_directory, true);
2953                                 if (k < 0) {
2954                                         log_error("Failed to make tree read-only: %s", strerror(-k));
2955                                         goto child_fail;
2956                                 }
2957                         }
2958
2959                         if (mount_all(arg_directory) < 0)
2960                                 goto child_fail;
2961
2962                         if (copy_devnodes(arg_directory) < 0)
2963                                 goto child_fail;
2964
2965                         if (setup_ptmx(arg_directory) < 0)
2966                                 goto child_fail;
2967
2968                         dev_setup(arg_directory);
2969
2970                         if (audit_still_doesnt_work_in_containers() < 0)
2971                                 goto child_fail;
2972
2973                         if (setup_dev_console(arg_directory, console) < 0)
2974                                 goto child_fail;
2975
2976                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2977                                 goto child_fail;
2978
2979                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
2980
2981                         if (setup_boot_id(arg_directory) < 0)
2982                                 goto child_fail;
2983
2984                         if (setup_timezone(arg_directory) < 0)
2985                                 goto child_fail;
2986
2987                         if (setup_resolv_conf(arg_directory) < 0)
2988                                 goto child_fail;
2989
2990                         if (setup_journal(arg_directory) < 0)
2991                                 goto child_fail;
2992
2993                         if (mount_binds(arg_directory, arg_bind, false) < 0)
2994                                 goto child_fail;
2995
2996                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
2997                                 goto child_fail;
2998
2999                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3000                                 goto child_fail;
3001
3002                         /* Tell the parent that we are ready, and that
3003                          * it can cgroupify us to that we lack access
3004                          * to certain devices and resources. */
3005                         r = eventfd_send_state(eventfds[1],
3006                                                EVENTFD_CHILD_SUCCEEDED);
3007                         eventfds[1] = safe_close(eventfds[1]);
3008                         if (r < 0)
3009                                 goto child_fail;
3010
3011                         if (chdir(arg_directory) < 0) {
3012                                 log_error("chdir(%s) failed: %m", arg_directory);
3013                                 goto child_fail;
3014                         }
3015
3016                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3017                                 log_error("mount(MS_MOVE) failed: %m");
3018                                 goto child_fail;
3019                         }
3020
3021                         if (chroot(".") < 0) {
3022                                 log_error("chroot() failed: %m");
3023                                 goto child_fail;
3024                         }
3025
3026                         if (chdir("/") < 0) {
3027                                 log_error("chdir() failed: %m");
3028                                 goto child_fail;
3029                         }
3030
3031                         umask(0022);
3032
3033                         if (arg_private_network)
3034                                 loopback_setup();
3035
3036                         if (drop_capabilities() < 0) {
3037                                 log_error("drop_capabilities() failed: %m");
3038                                 goto child_fail;
3039                         }
3040
3041                         r = change_uid_gid(&home);
3042                         if (r < 0)
3043                                 goto child_fail;
3044
3045                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3046                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3047                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3048                                 log_oom();
3049                                 goto child_fail;
3050                         }
3051
3052                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3053                                 char as_uuid[37];
3054
3055                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3056                                         log_oom();
3057                                         goto child_fail;
3058                                 }
3059                         }
3060
3061                         if (fdset_size(fds) > 0) {
3062                                 k = fdset_cloexec(fds, false);
3063                                 if (k < 0) {
3064                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3065                                         goto child_fail;
3066                                 }
3067
3068                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3069                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3070                                         log_oom();
3071                                         goto child_fail;
3072                                 }
3073                         }
3074
3075                         setup_hostname();
3076
3077                         if (arg_personality != 0xffffffffLU) {
3078                                 if (personality(arg_personality) < 0) {
3079                                         log_error("personality() failed: %m");
3080                                         goto child_fail;
3081                                 }
3082                         } else if (secondary) {
3083                                 if (personality(PER_LINUX32) < 0) {
3084                                         log_error("personality() failed: %m");
3085                                         goto child_fail;
3086                                 }
3087                         }
3088
3089 #ifdef HAVE_SELINUX
3090                         if (arg_selinux_context)
3091                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3092                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3093                                         goto child_fail;
3094                                 }
3095 #endif
3096
3097                         if (!strv_isempty(arg_setenv)) {
3098                                 char **n;
3099
3100                                 n = strv_env_merge(2, envp, arg_setenv);
3101                                 if (!n) {
3102                                         log_oom();
3103                                         goto child_fail;
3104                                 }
3105
3106                                 env_use = n;
3107                         } else
3108                                 env_use = (char**) envp;
3109
3110                         /* Wait until the parent is ready with the setup, too... */
3111                         r = eventfd_parent_succeeded(eventfds[0]);
3112                         eventfds[0] = safe_close(eventfds[0]);
3113                         if (r < 0)
3114                                 goto child_fail;
3115
3116                         if (arg_boot) {
3117                                 char **a;
3118                                 size_t l;
3119
3120                                 /* Automatically search for the init system */
3121
3122                                 l = 1 + argc - optind;
3123                                 a = newa(char*, l + 1);
3124                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3125
3126                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3127                                 execve(a[0], a, env_use);
3128
3129                                 a[0] = (char*) "/lib/systemd/systemd";
3130                                 execve(a[0], a, env_use);
3131
3132                                 a[0] = (char*) "/sbin/init";
3133                                 execve(a[0], a, env_use);
3134                         } else if (argc > optind)
3135                                 execvpe(argv[optind], argv + optind, env_use);
3136                         else {
3137                                 chdir(home ? home : "/root");
3138                                 execle("/bin/bash", "-bash", NULL, env_use);
3139                                 execle("/bin/sh", "-sh", NULL, env_use);
3140                         }
3141
3142                         log_error("execv() failed: %m");
3143
3144                 child_fail:
3145                         /* Tell the parent that the setup failed, so he
3146                          * can clean up resources and terminate. */
3147                         if (eventfds[1] != -1)
3148                                 eventfd_send_state(eventfds[1],
3149                                                    EVENTFD_CHILD_FAILED);
3150                         _exit(EXIT_FAILURE);
3151                 }
3152
3153                 fdset_free(fds);
3154                 fds = NULL;
3155
3156                 /* Wait for the child event:
3157                  * If EVENTFD_CHILD_FAILED, the child will terminate soon.
3158                  * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
3159                  * it is ready with all it needs to do with priviliges.
3160                  * After we got the notification we can make the process
3161                  * join its cgroup which might limit what it can do */
3162                 r = eventfd_child_succeeded(eventfds[1]);
3163                 eventfds[1] = safe_close(eventfds[1]);
3164                 if (r < 0)
3165                         goto check_container_status;
3166
3167                 r = register_machine(pid);
3168                 if (r < 0)
3169                         goto finish;
3170
3171                 r = move_network_interfaces(pid);
3172                 if (r < 0)
3173                         goto finish;
3174
3175                 r = setup_veth(pid, veth_name);
3176                 if (r < 0)
3177                         goto finish;
3178
3179                 r = setup_bridge(veth_name);
3180                 if (r < 0)
3181                         goto finish;
3182
3183                 r = setup_macvlan(pid);
3184                 if (r < 0)
3185                         goto finish;
3186
3187                 /* Block SIGCHLD here, before notifying child.
3188                  * process_pty() will handle it with the other signals. */
3189                 r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3190                 if (r < 0)
3191                         goto finish;
3192
3193                 /* Reset signal to default */
3194                 r = default_signals(SIGCHLD, -1);
3195                 if (r < 0)
3196                         goto finish;
3197
3198                 /* Notify the child that the parent is ready with all
3199                  * its setup, and that the child can now hand over
3200                  * control to the code to run inside the container. */
3201                 r = eventfd_send_state(eventfds[0],
3202                                        EVENTFD_PARENT_SUCCEEDED);
3203                 eventfds[0] = safe_close(eventfds[0]);
3204                 if (r < 0)
3205                         goto finish;
3206
3207                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3208                 if (k < 0) {
3209                         r = EXIT_FAILURE;
3210                         break;
3211                 }
3212
3213                 if (!arg_quiet)
3214                         putc('\n', stdout);
3215
3216                 /* Kill if it is not dead yet anyway */
3217                 terminate_machine(pid);
3218
3219 check_container_status:
3220                 /* Redundant, but better safe than sorry */
3221                 kill(pid, SIGKILL);
3222
3223                 r = wait_for_container(pid, &container_status);
3224                 pid = 0;
3225
3226                 if (r < 0) {
3227                         r = EXIT_FAILURE;
3228                         break;
3229                 } else if (container_status == CONTAINER_TERMINATED)
3230                         break;
3231
3232                 /* CONTAINER_REBOOTED, loop again */
3233         }
3234
3235 finish:
3236         loop_remove(loop_nr, &image_fd);
3237
3238         if (pid > 0)
3239                 kill(pid, SIGKILL);
3240
3241         free(arg_directory);
3242         free(arg_machine);
3243         free(arg_user);
3244         strv_free(arg_setenv);
3245         strv_free(arg_network_interfaces);
3246         strv_free(arg_network_macvlan);
3247         strv_free(arg_bind);
3248         strv_free(arg_bind_ro);
3249
3250         return r;
3251 }