chiark / gitweb /
nspawn: block open_by_handle_at() and others via seccomp
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "eventfd-util.h"
88 #include "blkid-util.h"
89 #include "gpt.h"
90 #include "siphash24.h"
91 #include "copy.h"
92 #include "base-filesystem.h"
93
94 #ifdef HAVE_SECCOMP
95 #include "seccomp-util.h"
96 #endif
97
/* Possible outcomes of a container run.
 * NOTE(review): the consumers of these values are outside this part of the
 * file; presumably they tell the caller whether to exit or to start the
 * container again -- confirm against the wait/exit handling code. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
} ContainerStatus;
102
/* Journal linking mode, selected with --link-journal=MODE ("no", "auto",
 * "host", "guest") or the -j shortcut; see parse_argv(). The default is
 * LINK_AUTO (see arg_link_journal). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, also set by -j */
} LinkJournal;
109
/* Command line state. Everything below is filled in by parse_argv(). */

/* -D/--directory=: canonicalized root directory, heap-allocated. */
static char *arg_directory = NULL;
/* -u/--user=: heap-allocated copy of the user argument. */
static char *arg_user = NULL;
/* --uuid=: parsed machine UUID; all-zero unless the option is given. */
static sd_id128_t arg_uuid = {};
/* -M/--machine=: heap-allocated machine name, NULL when unset or reset
 * with an empty argument. */
static char *arg_machine = NULL;
/* -Z/--selinux-context=: points directly into argv, not owned. */
static const char *arg_selinux_context = NULL;
/* -L/--selinux-apifs-context=: points directly into argv, not owned.
 * mount_all() appends it as a context= mount option for tmpfs/devpts. */
static const char *arg_selinux_apifs_context = NULL;
/* -S/--slice=: points directly into argv, not owned. */
static const char *arg_slice = NULL;
/* --private-network; also switched on by every --network-* option. */
static bool arg_private_network = false;
/* --read-only */
static bool arg_read_only = false;
/* -b/--boot */
static bool arg_boot = false;
/* --link-journal= / -j */
static LinkJournal arg_link_journal = LINK_AUTO;
/* Bit mask of capabilities, one bit per CAP_* value. This is the default
 * set; parse_argv() ORs in the --capability= bits, adds CAP_NET_ADMIN when
 * a private network was requested and finally masks out the
 * --drop-capability= bits. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);
/* --bind=: flat string list of (source, destination) pairs, consumed
 * pairwise by mount_binds(). */
static char **arg_bind = NULL;
/* --bind-ro=: same layout as arg_bind. */
static char **arg_bind_ro = NULL;
/* --tmpfs=: flat string list of (path, mount options) pairs, consumed
 * pairwise by mount_tmpfs(). */
static char **arg_tmpfs = NULL;
/* --setenv=: NAME=VALUE assignments. */
static char **arg_setenv = NULL;
/* -q/--quiet */
static bool arg_quiet = false;
/* --share-system; forces arg_register to false. */
static bool arg_share_system = false;
/* --register=BOOLEAN */
static bool arg_register = true;
/* --keep-unit */
static bool arg_keep_unit = false;
/* --network-interface=: list of interface names. */
static char **arg_network_interfaces = NULL;
/* --network-macvlan=: list of interface names. */
static char **arg_network_macvlan = NULL;
/* --network-veth; also switched on by --network-bridge=. */
static bool arg_network_veth = false;
/* --network-bridge=: points directly into argv, not owned. */
static const char *arg_network_bridge = NULL;
/* --personality=: result of personality_from_string(). parse_argv() treats
 * 0xffffffffLU as that function's "unknown or unsupported" result.
 * NOTE(review): the same value as the default presumably means "not set" --
 * confirm where it is consumed. */
static unsigned long arg_personality = 0xffffffffLU;
/* -i/--image=: points directly into argv, not owned. Mutually exclusive
 * with arg_directory. */
static const char *arg_image = NULL;
162
163 static int help(void) {
164
165         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
166                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
167                "  -h --help                 Show this help\n"
168                "     --version              Print version string\n"
169                "  -q --quiet                Do not show status information\n"
170                "  -D --directory=PATH       Root directory for the container\n"
171                "  -i --image=PATH           File system device or image for the container\n"
172                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
173                "  -u --user=USER            Run the command under specified user or uid\n"
174                "  -M --machine=NAME         Set the machine name for the container\n"
175                "     --uuid=UUID            Set a specific machine UUID for the container\n"
176                "  -S --slice=SLICE          Place the container in the specified slice\n"
177                "     --private-network      Disable network in container\n"
178                "     --network-interface=INTERFACE\n"
179                "                            Assign an existing network interface to the\n"
180                "                            container\n"
181                "     --network-macvlan=INTERFACE\n"
182                "                            Create a macvlan network interface based on an\n"
183                "                            existing network interface to the container\n"
184                "     --network-veth         Add a virtual ethernet connection between host\n"
185                "                            and container\n"
186                "     --network-bridge=INTERFACE\n"
187                "                            Add a virtual ethernet connection between host\n"
188                "                            and container and add it to an existing bridge on\n"
189                "                            the host\n"
190                "  -Z --selinux-context=SECLABEL\n"
191                "                            Set the SELinux security context to be used by\n"
192                "                            processes in the container\n"
193                "  -L --selinux-apifs-context=SECLABEL\n"
194                "                            Set the SELinux security context to be used by\n"
195                "                            API/tmpfs file systems in the container\n"
196                "     --capability=CAP       In addition to the default, retain specified\n"
197                "                            capability\n"
198                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
199                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
200                "  -j                        Equivalent to --link-journal=host\n"
201                "     --read-only            Mount the root directory read-only\n"
202                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
203                "                            the container\n"
204                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
205                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
206                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
207                "     --share-system         Share system namespaces with host\n"
208                "     --register=BOOLEAN     Register container as machine\n"
209                "     --keep-unit            Do not register a scope for the machine, reuse\n"
210                "                            the service unit nspawn is running in\n",
211                program_invocation_short_name);
212
213         return 0;
214 }
215
216 static int parse_argv(int argc, char *argv[]) {
217
218         enum {
219                 ARG_VERSION = 0x100,
220                 ARG_PRIVATE_NETWORK,
221                 ARG_UUID,
222                 ARG_READ_ONLY,
223                 ARG_CAPABILITY,
224                 ARG_DROP_CAPABILITY,
225                 ARG_LINK_JOURNAL,
226                 ARG_BIND,
227                 ARG_BIND_RO,
228                 ARG_TMPFS,
229                 ARG_SETENV,
230                 ARG_SHARE_SYSTEM,
231                 ARG_REGISTER,
232                 ARG_KEEP_UNIT,
233                 ARG_NETWORK_INTERFACE,
234                 ARG_NETWORK_MACVLAN,
235                 ARG_NETWORK_VETH,
236                 ARG_NETWORK_BRIDGE,
237                 ARG_PERSONALITY,
238         };
239
240         static const struct option options[] = {
241                 { "help",                  no_argument,       NULL, 'h'                   },
242                 { "version",               no_argument,       NULL, ARG_VERSION           },
243                 { "directory",             required_argument, NULL, 'D'                   },
244                 { "user",                  required_argument, NULL, 'u'                   },
245                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
246                 { "boot",                  no_argument,       NULL, 'b'                   },
247                 { "uuid",                  required_argument, NULL, ARG_UUID              },
248                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
249                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
250                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
251                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
252                 { "bind",                  required_argument, NULL, ARG_BIND              },
253                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
254                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
255                 { "machine",               required_argument, NULL, 'M'                   },
256                 { "slice",                 required_argument, NULL, 'S'                   },
257                 { "setenv",                required_argument, NULL, ARG_SETENV            },
258                 { "selinux-context",       required_argument, NULL, 'Z'                   },
259                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
260                 { "quiet",                 no_argument,       NULL, 'q'                   },
261                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
262                 { "register",              required_argument, NULL, ARG_REGISTER          },
263                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
264                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
265                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
266                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
267                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
268                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
269                 { "image",                 required_argument, NULL, 'i'                   },
270                 {}
271         };
272
273         int c, r;
274         uint64_t plus = 0, minus = 0;
275
276         assert(argc >= 0);
277         assert(argv);
278
279         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
280
281                 switch (c) {
282
283                 case 'h':
284                         return help();
285
286                 case ARG_VERSION:
287                         puts(PACKAGE_STRING);
288                         puts(SYSTEMD_FEATURES);
289                         return 0;
290
291                 case 'D':
292                         free(arg_directory);
293                         arg_directory = canonicalize_file_name(optarg);
294                         if (!arg_directory) {
295                                 log_error("Invalid root directory: %m");
296                                 return -ENOMEM;
297                         }
298
299                         break;
300
301                 case 'i':
302                         arg_image = optarg;
303                         break;
304
305                 case 'u':
306                         free(arg_user);
307                         arg_user = strdup(optarg);
308                         if (!arg_user)
309                                 return log_oom();
310
311                         break;
312
313                 case ARG_NETWORK_BRIDGE:
314                         arg_network_bridge = optarg;
315
316                         /* fall through */
317
318                 case ARG_NETWORK_VETH:
319                         arg_network_veth = true;
320                         arg_private_network = true;
321                         break;
322
323                 case ARG_NETWORK_INTERFACE:
324                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
325                                 return log_oom();
326
327                         arg_private_network = true;
328                         break;
329
330                 case ARG_NETWORK_MACVLAN:
331                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
332                                 return log_oom();
333
334                         /* fall through */
335
336                 case ARG_PRIVATE_NETWORK:
337                         arg_private_network = true;
338                         break;
339
340                 case 'b':
341                         arg_boot = true;
342                         break;
343
344                 case ARG_UUID:
345                         r = sd_id128_from_string(optarg, &arg_uuid);
346                         if (r < 0) {
347                                 log_error("Invalid UUID: %s", optarg);
348                                 return r;
349                         }
350                         break;
351
352                 case 'S':
353                         arg_slice = optarg;
354                         break;
355
356                 case 'M':
357                         if (isempty(optarg)) {
358                                 free(arg_machine);
359                                 arg_machine = NULL;
360                         } else {
361
362                                 if (!hostname_is_valid(optarg)) {
363                                         log_error("Invalid machine name: %s", optarg);
364                                         return -EINVAL;
365                                 }
366
367                                 free(arg_machine);
368                                 arg_machine = strdup(optarg);
369                                 if (!arg_machine)
370                                         return log_oom();
371
372                                 break;
373                         }
374
375                 case 'Z':
376                         arg_selinux_context = optarg;
377                         break;
378
379                 case 'L':
380                         arg_selinux_apifs_context = optarg;
381                         break;
382
383                 case ARG_READ_ONLY:
384                         arg_read_only = true;
385                         break;
386
387                 case ARG_CAPABILITY:
388                 case ARG_DROP_CAPABILITY: {
389                         char *state, *word;
390                         size_t length;
391
392                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
393                                 _cleanup_free_ char *t;
394                                 cap_value_t cap;
395
396                                 t = strndup(word, length);
397                                 if (!t)
398                                         return log_oom();
399
400                                 if (streq(t, "all")) {
401                                         if (c == ARG_CAPABILITY)
402                                                 plus = (uint64_t) -1;
403                                         else
404                                                 minus = (uint64_t) -1;
405                                 } else {
406                                         if (cap_from_name(t, &cap) < 0) {
407                                                 log_error("Failed to parse capability %s.", t);
408                                                 return -EINVAL;
409                                         }
410
411                                         if (c == ARG_CAPABILITY)
412                                                 plus |= 1ULL << (uint64_t) cap;
413                                         else
414                                                 minus |= 1ULL << (uint64_t) cap;
415                                 }
416                         }
417
418                         break;
419                 }
420
421                 case 'j':
422                         arg_link_journal = LINK_GUEST;
423                         break;
424
425                 case ARG_LINK_JOURNAL:
426                         if (streq(optarg, "auto"))
427                                 arg_link_journal = LINK_AUTO;
428                         else if (streq(optarg, "no"))
429                                 arg_link_journal = LINK_NO;
430                         else if (streq(optarg, "guest"))
431                                 arg_link_journal = LINK_GUEST;
432                         else if (streq(optarg, "host"))
433                                 arg_link_journal = LINK_HOST;
434                         else {
435                                 log_error("Failed to parse link journal mode %s", optarg);
436                                 return -EINVAL;
437                         }
438
439                         break;
440
441                 case ARG_BIND:
442                 case ARG_BIND_RO: {
443                         _cleanup_free_ char *a = NULL, *b = NULL;
444                         char *e;
445                         char ***x;
446
447                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
448
449                         e = strchr(optarg, ':');
450                         if (e) {
451                                 a = strndup(optarg, e - optarg);
452                                 b = strdup(e + 1);
453                         } else {
454                                 a = strdup(optarg);
455                                 b = strdup(optarg);
456                         }
457
458                         if (!a || !b)
459                                 return log_oom();
460
461                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
462                                 log_error("Invalid bind mount specification: %s", optarg);
463                                 return -EINVAL;
464                         }
465
466                         r = strv_extend(x, a);
467                         if (r < 0)
468                                 return log_oom();
469
470                         r = strv_extend(x, b);
471                         if (r < 0)
472                                 return log_oom();
473
474                         break;
475                 }
476
477                 case ARG_TMPFS: {
478                         _cleanup_free_ char *a = NULL, *b = NULL;
479                         char *e;
480
481                         e = strchr(optarg, ':');
482                         if (e) {
483                                 a = strndup(optarg, e - optarg);
484                                 b = strdup(e + 1);
485                         } else {
486                                 a = strdup(optarg);
487                                 b = strdup("mode=0755");
488                         }
489
490                         if (!a || !b)
491                                 return log_oom();
492
493                         if (!path_is_absolute(a)) {
494                                 log_error("Invalid tmpfs specification: %s", optarg);
495                                 return -EINVAL;
496                         }
497
498                         r = strv_push(&arg_tmpfs, a);
499                         if (r < 0)
500                                 return log_oom();
501
502                         a = NULL;
503
504                         r = strv_push(&arg_tmpfs, b);
505                         if (r < 0)
506                                 return log_oom();
507
508                         b = NULL;
509
510                         break;
511                 }
512
513                 case ARG_SETENV: {
514                         char **n;
515
516                         if (!env_assignment_is_valid(optarg)) {
517                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
518                                 return -EINVAL;
519                         }
520
521                         n = strv_env_set(arg_setenv, optarg);
522                         if (!n)
523                                 return log_oom();
524
525                         strv_free(arg_setenv);
526                         arg_setenv = n;
527                         break;
528                 }
529
530                 case 'q':
531                         arg_quiet = true;
532                         break;
533
534                 case ARG_SHARE_SYSTEM:
535                         arg_share_system = true;
536                         break;
537
538                 case ARG_REGISTER:
539                         r = parse_boolean(optarg);
540                         if (r < 0) {
541                                 log_error("Failed to parse --register= argument: %s", optarg);
542                                 return r;
543                         }
544
545                         arg_register = r;
546                         break;
547
548                 case ARG_KEEP_UNIT:
549                         arg_keep_unit = true;
550                         break;
551
552                 case ARG_PERSONALITY:
553
554                         arg_personality = personality_from_string(optarg);
555                         if (arg_personality == 0xffffffffLU) {
556                                 log_error("Unknown or unsupported personality '%s'.", optarg);
557                                 return -EINVAL;
558                         }
559
560                         break;
561
562                 case '?':
563                         return -EINVAL;
564
565                 default:
566                         assert_not_reached("Unhandled option");
567                 }
568         }
569
570         if (arg_share_system)
571                 arg_register = false;
572
573         if (arg_boot && arg_share_system) {
574                 log_error("--boot and --share-system may not be combined.");
575                 return -EINVAL;
576         }
577
578         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
579                 log_error("--keep-unit may not be used when invoked from a user session.");
580                 return -EINVAL;
581         }
582
583         if (arg_directory && arg_image) {
584                 log_error("--directory= and --image= may not be combined.");
585                 return -EINVAL;
586         }
587
588         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
589
590         return 1;
591 }
592
593 static int mount_all(const char *dest) {
594
595         typedef struct MountPoint {
596                 const char *what;
597                 const char *where;
598                 const char *type;
599                 const char *options;
600                 unsigned long flags;
601                 bool fatal;
602         } MountPoint;
603
604         static const MountPoint mount_table[] = {
605                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
606                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
607                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
608                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
609                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
610                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
611                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
612                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
613 #ifdef HAVE_SELINUX
614                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
615                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
616 #endif
617         };
618
619         unsigned k;
620         int r = 0;
621
622         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
623                 _cleanup_free_ char *where = NULL;
624 #ifdef HAVE_SELINUX
625                 _cleanup_free_ char *options = NULL;
626 #endif
627                 const char *o;
628                 int t;
629
630                 where = strjoin(dest, "/", mount_table[k].where, NULL);
631                 if (!where)
632                         return log_oom();
633
634                 t = path_is_mount_point(where, true);
635                 if (t < 0) {
636                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
637
638                         if (r == 0)
639                                 r = t;
640
641                         continue;
642                 }
643
644                 /* Skip this entry if it is not a remount. */
645                 if (mount_table[k].what && t > 0)
646                         continue;
647
648                 mkdir_p(where, 0755);
649
650 #ifdef HAVE_SELINUX
651                 if (arg_selinux_apifs_context &&
652                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
653                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
654                         if (!options)
655                                 return log_oom();
656
657                         o = options;
658                 } else
659 #endif
660                         o = mount_table[k].options;
661
662
663                 if (mount(mount_table[k].what,
664                           where,
665                           mount_table[k].type,
666                           mount_table[k].flags,
667                           o) < 0 &&
668                     mount_table[k].fatal) {
669
670                         log_error("mount(%s) failed: %m", where);
671
672                         if (r == 0)
673                                 r = -errno;
674                 }
675         }
676
677         return r;
678 }
679
680 static int mount_binds(const char *dest, char **l, bool ro) {
681         char **x, **y;
682
683         STRV_FOREACH_PAIR(x, y, l) {
684                 _cleanup_free_ char *where = NULL;
685                 struct stat source_st, dest_st;
686                 int r;
687
688                 if (stat(*x, &source_st) < 0) {
689                         log_error("Failed to stat %s: %m", *x);
690                         return -errno;
691                 }
692
693                 where = strappend(dest, *y);
694                 if (!where)
695                         return log_oom();
696
697                 r = stat(where, &dest_st);
698                 if (r == 0) {
699                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
700                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
701                                 return -EINVAL;
702                         }
703                 } else if (errno == ENOENT) {
704                         r = mkdir_parents_label(where, 0755);
705                         if (r < 0) {
706                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
707                                 return r;
708                         }
709                 } else {
710                         log_error("Failed to bind mount %s: %m", *x);
711                         return -errno;
712                 }
713
714                 /* Create the mount point, but be conservative -- refuse to create block
715                 * and char devices. */
716                 if (S_ISDIR(source_st.st_mode))
717                         mkdir_label(where, 0755);
718                 else if (S_ISFIFO(source_st.st_mode))
719                         mkfifo(where, 0644);
720                 else if (S_ISSOCK(source_st.st_mode))
721                         mknod(where, 0644 | S_IFSOCK, 0);
722                 else if (S_ISREG(source_st.st_mode))
723                         touch(where);
724                 else {
725                         log_error("Refusing to create mountpoint for file: %s", *x);
726                         return -ENOTSUP;
727                 }
728
729                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
730                         log_error("mount(%s) failed: %m", where);
731                         return -errno;
732                 }
733
734                 if (ro) {
735                         r = bind_remount_recursive(where, true);
736                         if (r < 0) {
737                                 log_error("Read-Only bind mount failed: %s", strerror(-r));
738                                 return r;
739                         }
740                 }
741         }
742
743         return 0;
744 }
745
746 static int mount_tmpfs(const char *dest) {
747         char **i, **o;
748
749         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
750                 _cleanup_free_ char *where = NULL;
751
752                 where = strappend(dest, *i);
753                 if (!where)
754                         return log_oom();
755
756                 mkdir_label(where, 0755);
757
758                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
759                         log_error("tmpfs mount to %s failed: %m", where);
760                         return -errno;
761                 }
762         }
763
764         return 0;
765 }
766
767 static int setup_timezone(const char *dest) {
768         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
769         char *z, *y;
770         int r;
771
772         assert(dest);
773
774         /* Fix the timezone, if possible */
775         r = readlink_malloc("/etc/localtime", &p);
776         if (r < 0) {
777                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
778                 return 0;
779         }
780
781         z = path_startswith(p, "../usr/share/zoneinfo/");
782         if (!z)
783                 z = path_startswith(p, "/usr/share/zoneinfo/");
784         if (!z) {
785                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
786                 return 0;
787         }
788
789         where = strappend(dest, "/etc/localtime");
790         if (!where)
791                 return log_oom();
792
793         r = readlink_malloc(where, &q);
794         if (r >= 0) {
795                 y = path_startswith(q, "../usr/share/zoneinfo/");
796                 if (!y)
797                         y = path_startswith(q, "/usr/share/zoneinfo/");
798
799
800                 /* Already pointing to the right place? Then do nothing .. */
801                 if (y && streq(y, z))
802                         return 0;
803         }
804
805         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
806         if (!check)
807                 return log_oom();
808
809         if (access(check, F_OK) < 0) {
810                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
811                 return 0;
812         }
813
814         what = strappend("../usr/share/zoneinfo/", z);
815         if (!what)
816                 return log_oom();
817
818         unlink(where);
819         if (symlink(what, where) < 0) {
820                 log_error("Failed to correct timezone of container: %m");
821                 return 0;
822         }
823
824         return 0;
825 }
826
827 static int setup_resolv_conf(const char *dest) {
828         _cleanup_free_ char *where = NULL;
829
830         assert(dest);
831
832         if (arg_private_network)
833                 return 0;
834
835         /* Fix resolv.conf, if possible */
836         where = strappend(dest, "/etc/resolv.conf");
837         if (!where)
838                 return log_oom();
839
840         /* We don't really care for the results of this really. If it
841          * fails, it fails, but meh... */
842         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
843
844         return 0;
845 }
846
847 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
848
849         snprintf(s, 37,
850                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
851                  SD_ID128_FORMAT_VAL(id));
852
853         return s;
854 }
855
856 static int setup_boot_id(const char *dest) {
857         _cleanup_free_ char *from = NULL, *to = NULL;
858         sd_id128_t rnd = {};
859         char as_uuid[37];
860         int r;
861
862         assert(dest);
863
864         if (arg_share_system)
865                 return 0;
866
867         /* Generate a new randomized boot ID, so that each boot-up of
868          * the container gets a new one */
869
870         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
871         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
872         if (!from || !to)
873                 return log_oom();
874
875         r = sd_id128_randomize(&rnd);
876         if (r < 0) {
877                 log_error("Failed to generate random boot id: %s", strerror(-r));
878                 return r;
879         }
880
881         id128_format_as_uuid(rnd, as_uuid);
882
883         r = write_string_file(from, as_uuid);
884         if (r < 0) {
885                 log_error("Failed to write boot id: %s", strerror(-r));
886                 return r;
887         }
888
889         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
890                 log_error("Failed to bind mount boot id: %m");
891                 r = -errno;
892         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
893                 log_warning("Failed to make boot id read-only: %m");
894
895         unlink(from);
896         return r;
897 }
898
899 static int copy_devnodes(const char *dest) {
900
901         static const char devnodes[] =
902                 "null\0"
903                 "zero\0"
904                 "full\0"
905                 "random\0"
906                 "urandom\0"
907                 "tty\0";
908
909         const char *d;
910         int r = 0;
911         _cleanup_umask_ mode_t u;
912
913         assert(dest);
914
915         u = umask(0000);
916
917         NULSTR_FOREACH(d, devnodes) {
918                 _cleanup_free_ char *from = NULL, *to = NULL;
919                 struct stat st;
920
921                 from = strappend("/dev/", d);
922                 to = strjoin(dest, "/dev/", d, NULL);
923                 if (!from || !to)
924                         return log_oom();
925
926                 if (stat(from, &st) < 0) {
927
928                         if (errno != ENOENT) {
929                                 log_error("Failed to stat %s: %m", from);
930                                 return -errno;
931                         }
932
933                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
934
935                         log_error("%s is not a char or block device, cannot copy", from);
936                         return -EIO;
937
938                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
939
940                         log_error("mknod(%s) failed: %m", dest);
941                         return  -errno;
942                 }
943         }
944
945         return r;
946 }
947
948 static int setup_ptmx(const char *dest) {
949         _cleanup_free_ char *p = NULL;
950
951         p = strappend(dest, "/dev/ptmx");
952         if (!p)
953                 return log_oom();
954
955         if (symlink("pts/ptmx", p) < 0) {
956                 log_error("Failed to create /dev/ptmx symlink: %m");
957                 return -errno;
958         }
959
960         return 0;
961 }
962
963 static int setup_dev_console(const char *dest, const char *console) {
964         _cleanup_umask_ mode_t u;
965         const char *to;
966         struct stat st;
967         int r;
968
969         assert(dest);
970         assert(console);
971
972         u = umask(0000);
973
974         if (stat("/dev/null", &st) < 0) {
975                 log_error("Failed to stat /dev/null: %m");
976                 return -errno;
977         }
978
979         r = chmod_and_chown(console, 0600, 0, 0);
980         if (r < 0) {
981                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
982                 return r;
983         }
984
985         /* We need to bind mount the right tty to /dev/console since
986          * ptys can only exist on pts file systems. To have something
987          * to bind mount things on we create a device node first, and
988          * use /dev/null for that since we the cgroups device policy
989          * allows us to create that freely, while we cannot create
990          * /dev/console. (Note that the major minor doesn't actually
991          * matter here, since we mount it over anyway). */
992
993         to = strappenda(dest, "/dev/console");
994         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
995                 log_error("mknod() for /dev/console failed: %m");
996                 return -errno;
997         }
998
999         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1000                 log_error("Bind mount for /dev/console failed: %m");
1001                 return -errno;
1002         }
1003
1004         return 0;
1005 }
1006
1007 static int setup_kmsg(const char *dest, int kmsg_socket) {
1008         _cleanup_free_ char *from = NULL, *to = NULL;
1009         int r, fd, k;
1010         _cleanup_umask_ mode_t u;
1011         union {
1012                 struct cmsghdr cmsghdr;
1013                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1014         } control = {};
1015         struct msghdr mh = {
1016                 .msg_control = &control,
1017                 .msg_controllen = sizeof(control),
1018         };
1019         struct cmsghdr *cmsg;
1020
1021         assert(dest);
1022         assert(kmsg_socket >= 0);
1023
1024         u = umask(0000);
1025
1026         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1027          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1028          * on the reading side behave very similar to /proc/kmsg,
1029          * their writing side behaves differently from /dev/kmsg in
1030          * that writing blocks when nothing is reading. In order to
1031          * avoid any problems with containers deadlocking due to this
1032          * we simply make /dev/kmsg unavailable to the container. */
1033         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1034             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1035                 return log_oom();
1036
1037         if (mkfifo(from, 0600) < 0) {
1038                 log_error("mkfifo() for /dev/kmsg failed: %m");
1039                 return -errno;
1040         }
1041
1042         r = chmod_and_chown(from, 0600, 0, 0);
1043         if (r < 0) {
1044                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1045                 return r;
1046         }
1047
1048         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1049                 log_error("Bind mount for /proc/kmsg failed: %m");
1050                 return -errno;
1051         }
1052
1053         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1054         if (fd < 0) {
1055                 log_error("Failed to open fifo: %m");
1056                 return -errno;
1057         }
1058
1059         cmsg = CMSG_FIRSTHDR(&mh);
1060         cmsg->cmsg_level = SOL_SOCKET;
1061         cmsg->cmsg_type = SCM_RIGHTS;
1062         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1063         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1064
1065         mh.msg_controllen = cmsg->cmsg_len;
1066
1067         /* Store away the fd in the socket, so that it stays open as
1068          * long as we run the child */
1069         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1070         safe_close(fd);
1071
1072         if (k < 0) {
1073                 log_error("Failed to send FIFO fd: %m");
1074                 return -errno;
1075         }
1076
1077         /* And now make the FIFO unavailable as /dev/kmsg... */
1078         unlink(from);
1079         return 0;
1080 }
1081
1082 static int setup_hostname(void) {
1083
1084         if (arg_share_system)
1085                 return 0;
1086
1087         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1088                 return -errno;
1089
1090         return 0;
1091 }
1092
1093 static int setup_journal(const char *directory) {
1094         sd_id128_t machine_id, this_id;
1095         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1096         char *id;
1097         int r;
1098
1099         p = strappend(directory, "/etc/machine-id");
1100         if (!p)
1101                 return log_oom();
1102
1103         r = read_one_line_file(p, &b);
1104         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1105                 return 0;
1106         else if (r < 0) {
1107                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1108                 return r;
1109         }
1110
1111         id = strstrip(b);
1112         if (isempty(id) && arg_link_journal == LINK_AUTO)
1113                 return 0;
1114
1115         /* Verify validity */
1116         r = sd_id128_from_string(id, &machine_id);
1117         if (r < 0) {
1118                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1119                 return r;
1120         }
1121
1122         r = sd_id128_get_machine(&this_id);
1123         if (r < 0) {
1124                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1125                 return r;
1126         }
1127
1128         if (sd_id128_equal(machine_id, this_id)) {
1129                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1130                          "Host and machine ids are equal (%s): refusing to link journals", id);
1131                 if (arg_link_journal == LINK_AUTO)
1132                         return 0;
1133                 return
1134                         -EEXIST;
1135         }
1136
1137         if (arg_link_journal == LINK_NO)
1138                 return 0;
1139
1140         free(p);
1141         p = strappend("/var/log/journal/", id);
1142         q = strjoin(directory, "/var/log/journal/", id, NULL);
1143         if (!p || !q)
1144                 return log_oom();
1145
1146         if (path_is_mount_point(p, false) > 0) {
1147                 if (arg_link_journal != LINK_AUTO) {
1148                         log_error("%s: already a mount point, refusing to use for journal", p);
1149                         return -EEXIST;
1150                 }
1151
1152                 return 0;
1153         }
1154
1155         if (path_is_mount_point(q, false) > 0) {
1156                 if (arg_link_journal != LINK_AUTO) {
1157                         log_error("%s: already a mount point, refusing to use for journal", q);
1158                         return -EEXIST;
1159                 }
1160
1161                 return 0;
1162         }
1163
1164         r = readlink_and_make_absolute(p, &d);
1165         if (r >= 0) {
1166                 if ((arg_link_journal == LINK_GUEST ||
1167                      arg_link_journal == LINK_AUTO) &&
1168                     path_equal(d, q)) {
1169
1170                         r = mkdir_p(q, 0755);
1171                         if (r < 0)
1172                                 log_warning("failed to create directory %s: %m", q);
1173                         return 0;
1174                 }
1175
1176                 if (unlink(p) < 0) {
1177                         log_error("Failed to remove symlink %s: %m", p);
1178                         return -errno;
1179                 }
1180         } else if (r == -EINVAL) {
1181
1182                 if (arg_link_journal == LINK_GUEST &&
1183                     rmdir(p) < 0) {
1184
1185                         if (errno == ENOTDIR) {
1186                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1187                                 return r;
1188                         } else {
1189                                 log_error("Failed to remove %s: %m", p);
1190                                 return -errno;
1191                         }
1192                 }
1193         } else if (r != -ENOENT) {
1194                 log_error("readlink(%s) failed: %m", p);
1195                 return r;
1196         }
1197
1198         if (arg_link_journal == LINK_GUEST) {
1199
1200                 if (symlink(q, p) < 0) {
1201                         log_error("Failed to symlink %s to %s: %m", q, p);
1202                         return -errno;
1203                 }
1204
1205                 r = mkdir_p(q, 0755);
1206                 if (r < 0)
1207                         log_warning("failed to create directory %s: %m", q);
1208                 return 0;
1209         }
1210
1211         if (arg_link_journal == LINK_HOST) {
1212                 r = mkdir_p(p, 0755);
1213                 if (r < 0) {
1214                         log_error("Failed to create %s: %m", p);
1215                         return r;
1216                 }
1217
1218         } else if (access(p, F_OK) < 0)
1219                 return 0;
1220
1221         if (dir_is_empty(q) == 0)
1222                 log_warning("%s is not empty, proceeding anyway.", q);
1223
1224         r = mkdir_p(q, 0755);
1225         if (r < 0) {
1226                 log_error("Failed to create %s: %m", q);
1227                 return r;
1228         }
1229
1230         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1231                 log_error("Failed to bind mount journal from host into guest: %m");
1232                 return -errno;
1233         }
1234
1235         return 0;
1236 }
1237
1238 static int setup_kdbus(const char *dest, const char *path) {
1239         const char *p;
1240
1241         if (!path)
1242                 return 0;
1243
1244         p = strappenda(dest, "/dev/kdbus");
1245         if (mkdir(p, 0755) < 0) {
1246                 log_error("Failed to create kdbus path: %m");
1247                 return  -errno;
1248         }
1249
1250         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1251                 log_error("Failed to mount kdbus domain path: %m");
1252                 return -errno;
1253         }
1254
1255         return 0;
1256 }
1257
1258 static int drop_capabilities(void) {
1259         return capability_bounding_set_drop(~arg_retain, false);
1260 }
1261
/* Announces the container with leader process "pid" to the
 * org.freedesktop.machine1 service on the system bus.
 *
 * With arg_keep_unit set the RegisterMachine method is called.
 * Otherwise CreateMachine is called, with an additional property
 * array carrying the slice (if arg_slice is non-empty), a strict
 * device policy and the list of permitted device nodes.
 *
 * Does nothing if arg_register is unset. Returns 0 on success or a
 * negative errno-style value on failure. */
static int register_machine(pid_t pid) {
        _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
        _cleanup_bus_unref_ sd_bus *bus = NULL;
        int r;

        if (!arg_register)
                return 0;

        r = sd_bus_default_system(&bus);
        if (r < 0) {
                log_error("Failed to open system bus: %s", strerror(-r));
                return r;
        }

        if (arg_keep_unit) {
                /* Single call, no extra properties. The result is
                 * checked by the common "r < 0" test at the end. */
                r = sd_bus_call_method(
                                bus,
                                "org.freedesktop.machine1",
                                "/org/freedesktop/machine1",
                                "org.freedesktop.machine1.Manager",
                                "RegisterMachine",
                                &error,
                                NULL,
                                "sayssus",
                                arg_machine,
                                SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
                                "nspawn",
                                "container",
                                (uint32_t) pid,
                                strempty(arg_directory));
        } else {
                _cleanup_bus_message_unref_ sd_bus_message *m = NULL;

                /* Build the CreateMachine message by hand, since it
                 * takes a variable list of properties after the fixed
                 * arguments. */
                r = sd_bus_message_new_method_call(
                                bus,
                                &m,
                                "org.freedesktop.machine1",
                                "/org/freedesktop/machine1",
                                "org.freedesktop.machine1.Manager",
                                "CreateMachine");
                if (r < 0) {
                        log_error("Failed to create message: %s", strerror(-r));
                        return r;
                }

                /* Same fixed arguments as for RegisterMachine above */
                r = sd_bus_message_append(
                                m,
                                "sayssus",
                                arg_machine,
                                SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
                                "nspawn",
                                "container",
                                (uint32_t) pid,
                                strempty(arg_directory));
                if (r < 0) {
                        log_error("Failed to append message arguments: %s", strerror(-r));
                        return r;
                }

                /* Everything appended until the matching
                 * close_container() call is an element of the a(sv)
                 * property array. */
                r = sd_bus_message_open_container(m, 'a', "(sv)");
                if (r < 0) {
                        log_error("Failed to open container: %s", strerror(-r));
                        return r;
                }

                if (!isempty(arg_slice)) {
                        r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
                        if (r < 0) {
                                log_error("Failed to append slice: %s", strerror(-r));
                                return r;
                        }
                }

                r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
                if (r < 0) {
                        log_error("Failed to add device policy: %s", strerror(-r));
                        return r;
                }

                /* The count (10) has to match the number of
                 * (path, access) pairs that follow. */
                r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
                                          /* Allow the container to
                                           * access and create the API
                                           * device nodes, so that
                                           * PrivateDevices= in the
                                           * container can work
                                           * fine */
                                          "/dev/null", "rwm",
                                          "/dev/zero", "rwm",
                                          "/dev/full", "rwm",
                                          "/dev/random", "rwm",
                                          "/dev/urandom", "rwm",
                                          "/dev/tty", "rwm",
                                          /* Allow the container
                                           * access to ptys. However,
                                           * do not permit the
                                           * container to ever create
                                           * these device nodes. */
                                          "/dev/pts/ptmx", "rw",
                                          "char-pts", "rw",
                                          /* Allow the container
                                           * access to all kdbus
                                           * devices. Again, the
                                           * container cannot create
                                           * these nodes, only use
                                           * them. We use a pretty
                                           * open match here, so that
                                           * the kernel API can still
                                           * change. */
                                          "char-kdbus", "rw",
                                          "char-kdbus/*", "rw");
                if (r < 0) {
                        log_error("Failed to add device whitelist: %s", strerror(-r));
                        return r;
                }

                r = sd_bus_message_close_container(m);
                if (r < 0) {
                        log_error("Failed to close container: %s", strerror(-r));
                        return r;
                }

                r = sd_bus_call(bus, m, 0, &error, NULL);
        }

        /* Common error check for the actual method call of either
         * branch above */
        if (r < 0) {
                log_error("Failed to register machine: %s", bus_error_message(&error, r));
                return r;
        }

        return 0;
}
1393
1394 static int terminate_machine(pid_t pid) {
1395         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1396         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1397         _cleanup_bus_unref_ sd_bus *bus = NULL;
1398         const char *path;
1399         int r;
1400
1401         if (!arg_register)
1402                 return 0;
1403
1404         r = sd_bus_default_system(&bus);
1405         if (r < 0) {
1406                 log_error("Failed to open system bus: %s", strerror(-r));
1407                 return r;
1408         }
1409
1410         r = sd_bus_call_method(
1411                         bus,
1412                         "org.freedesktop.machine1",
1413                         "/org/freedesktop/machine1",
1414                         "org.freedesktop.machine1.Manager",
1415                         "GetMachineByPID",
1416                         &error,
1417                         &reply,
1418                         "u",
1419                         (uint32_t) pid);
1420         if (r < 0) {
1421                 /* Note that the machine might already have been
1422                  * cleaned up automatically, hence don't consider it a
1423                  * failure if we cannot get the machine object. */
1424                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1425                 return 0;
1426         }
1427
1428         r = sd_bus_message_read(reply, "o", &path);
1429         if (r < 0)
1430                 return bus_log_parse_error(r);
1431
1432         r = sd_bus_call_method(
1433                         bus,
1434                         "org.freedesktop.machine1",
1435                         path,
1436                         "org.freedesktop.machine1.Machine",
1437                         "Terminate",
1438                         &error,
1439                         NULL,
1440                         NULL);
1441         if (r < 0) {
1442                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1443                 return 0;
1444         }
1445
1446         return 0;
1447 }
1448
1449 static int reset_audit_loginuid(void) {
1450         _cleanup_free_ char *p = NULL;
1451         int r;
1452
1453         if (arg_share_system)
1454                 return 0;
1455
1456         r = read_one_line_file("/proc/self/loginuid", &p);
1457         if (r == -ENOENT)
1458                 return 0;
1459         if (r < 0) {
1460                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1461                 return r;
1462         }
1463
1464         /* Already reset? */
1465         if (streq(p, "4294967295"))
1466                 return 0;
1467
1468         r = write_string_file("/proc/self/loginuid", "4294967295");
1469         if (r < 0) {
1470                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1471                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1472                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1473                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1474                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1475
1476                 sleep(5);
1477         }
1478
1479         return 0;
1480 }
1481
1482 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1483
1484 static int get_mac(struct ether_addr *mac) {
1485         int r;
1486
1487         uint8_t result[8];
1488         size_t l, sz;
1489         uint8_t *v;
1490
1491         l = strlen(arg_machine);
1492         sz = sizeof(sd_id128_t) + l;
1493         v = alloca(sz);
1494
1495         /* fetch some persistent data unique to the host */
1496         r = sd_id128_get_machine((sd_id128_t*) v);
1497         if (r < 0)
1498                 return r;
1499
1500         /* combine with some data unique (on this host) to this
1501          * container instance */
1502         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1503
1504         /* Let's hash the host machine ID plus the container name. We
1505          * use a fixed, but originally randomly created hash key here. */
1506         siphash24(result, v, sz, HASH_KEY.bytes);
1507
1508         assert_cc(ETH_ALEN <= sizeof(result));
1509         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1510
1511         /* see eth_random_addr in the kernel */
1512         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1513         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1514
1515         return 0;
1516 }
1517
1518 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1519         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1520         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1521         struct ether_addr mac;
1522         int r;
1523
1524         if (!arg_private_network)
1525                 return 0;
1526
1527         if (!arg_network_veth)
1528                 return 0;
1529
1530         /* Use two different interface name prefixes depending whether
1531          * we are in bridge mode or not. */
1532         if (arg_network_bridge)
1533                 memcpy(iface_name, "vb-", 3);
1534         else
1535                 memcpy(iface_name, "ve-", 3);
1536         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1537
1538         r = get_mac(&mac);
1539         if (r < 0) {
1540                 log_error("Failed to generate predictable MAC address for host0");
1541                 return r;
1542         }
1543
1544         r = sd_rtnl_open(&rtnl, 0);
1545         if (r < 0) {
1546                 log_error("Failed to connect to netlink: %s", strerror(-r));
1547                 return r;
1548         }
1549
1550         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1551         if (r < 0) {
1552                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1553                 return r;
1554         }
1555
1556         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1557         if (r < 0) {
1558                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1559                 return r;
1560         }
1561
1562         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1563         if (r < 0) {
1564                 log_error("Failed to open netlink container: %s", strerror(-r));
1565                 return r;
1566         }
1567
1568         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1569         if (r < 0) {
1570                 log_error("Failed to open netlink container: %s", strerror(-r));
1571                 return r;
1572         }
1573
1574         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1575         if (r < 0) {
1576                 log_error("Failed to open netlink container: %s", strerror(-r));
1577                 return r;
1578         }
1579
1580         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1581         if (r < 0) {
1582                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1583                 return r;
1584         }
1585
1586         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1587         if (r < 0) {
1588                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1589                 return r;
1590         }
1591
1592         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1593         if (r < 0) {
1594                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1595                 return r;
1596         }
1597
1598         r = sd_rtnl_message_close_container(m);
1599         if (r < 0) {
1600                 log_error("Failed to close netlink container: %s", strerror(-r));
1601                 return r;
1602         }
1603
1604         r = sd_rtnl_message_close_container(m);
1605         if (r < 0) {
1606                 log_error("Failed to close netlink container: %s", strerror(-r));
1607                 return r;
1608         }
1609
1610         r = sd_rtnl_message_close_container(m);
1611         if (r < 0) {
1612                 log_error("Failed to close netlink container: %s", strerror(-r));
1613                 return r;
1614         }
1615
1616         r = sd_rtnl_call(rtnl, m, 0, NULL);
1617         if (r < 0) {
1618                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1619                 return r;
1620         }
1621
1622         return 0;
1623 }
1624
1625 static int setup_bridge(const char veth_name[]) {
1626         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1627         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1628         int r, bridge;
1629
1630         if (!arg_private_network)
1631                 return 0;
1632
1633         if (!arg_network_veth)
1634                 return 0;
1635
1636         if (!arg_network_bridge)
1637                 return 0;
1638
1639         bridge = (int) if_nametoindex(arg_network_bridge);
1640         if (bridge <= 0) {
1641                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1642                 return -errno;
1643         }
1644
1645         r = sd_rtnl_open(&rtnl, 0);
1646         if (r < 0) {
1647                 log_error("Failed to connect to netlink: %s", strerror(-r));
1648                 return r;
1649         }
1650
1651         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1652         if (r < 0) {
1653                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1654                 return r;
1655         }
1656
1657         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1658         if (r < 0) {
1659                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1660                 return r;
1661         }
1662
1663         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1664         if (r < 0) {
1665                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1666                 return r;
1667         }
1668
1669         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1670         if (r < 0) {
1671                 log_error("Failed to add netlink master field: %s", strerror(-r));
1672                 return r;
1673         }
1674
1675         r = sd_rtnl_call(rtnl, m, 0, NULL);
1676         if (r < 0) {
1677                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1678                 return r;
1679         }
1680
1681         return 0;
1682 }
1683
1684 static int parse_interface(struct udev *udev, const char *name) {
1685         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1686         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1687         int ifi;
1688
1689         ifi = (int) if_nametoindex(name);
1690         if (ifi <= 0) {
1691                 log_error("Failed to resolve interface %s: %m", name);
1692                 return -errno;
1693         }
1694
1695         sprintf(ifi_str, "n%i", ifi);
1696         d = udev_device_new_from_device_id(udev, ifi_str);
1697         if (!d) {
1698                 log_error("Failed to get udev device for interface %s: %m", name);
1699                 return -errno;
1700         }
1701
1702         if (udev_device_get_is_initialized(d) <= 0) {
1703                 log_error("Network interface %s is not initialized yet.", name);
1704                 return -EBUSY;
1705         }
1706
1707         return ifi;
1708 }
1709
1710 static int move_network_interfaces(pid_t pid) {
1711         _cleanup_udev_unref_ struct udev *udev = NULL;
1712         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1713         char **i;
1714         int r;
1715
1716         if (!arg_private_network)
1717                 return 0;
1718
1719         if (strv_isempty(arg_network_interfaces))
1720                 return 0;
1721
1722         r = sd_rtnl_open(&rtnl, 0);
1723         if (r < 0) {
1724                 log_error("Failed to connect to netlink: %s", strerror(-r));
1725                 return r;
1726         }
1727
1728         udev = udev_new();
1729         if (!udev) {
1730                 log_error("Failed to connect to udev.");
1731                 return -ENOMEM;
1732         }
1733
1734         STRV_FOREACH(i, arg_network_interfaces) {
1735                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1736                 int ifi;
1737
1738                 ifi = parse_interface(udev, *i);
1739                 if (ifi < 0)
1740                         return ifi;
1741
1742                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1743                 if (r < 0) {
1744                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1745                         return r;
1746                 }
1747
1748                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1749                 if (r < 0) {
1750                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1751                         return r;
1752                 }
1753
1754                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1755                 if (r < 0) {
1756                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1757                         return r;
1758                 }
1759         }
1760
1761         return 0;
1762 }
1763
1764 static int setup_macvlan(pid_t pid) {
1765         _cleanup_udev_unref_ struct udev *udev = NULL;
1766         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1767         char **i;
1768         int r;
1769
1770         if (!arg_private_network)
1771                 return 0;
1772
1773         if (strv_isempty(arg_network_macvlan))
1774                 return 0;
1775
1776         r = sd_rtnl_open(&rtnl, 0);
1777         if (r < 0) {
1778                 log_error("Failed to connect to netlink: %s", strerror(-r));
1779                 return r;
1780         }
1781
1782         udev = udev_new();
1783         if (!udev) {
1784                 log_error("Failed to connect to udev.");
1785                 return -ENOMEM;
1786         }
1787
1788         STRV_FOREACH(i, arg_network_macvlan) {
1789                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1790                 _cleanup_free_ char *n = NULL;
1791                 int ifi;
1792
1793                 ifi = parse_interface(udev, *i);
1794                 if (ifi < 0)
1795                         return ifi;
1796
1797                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1798                 if (r < 0) {
1799                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1800                         return r;
1801                 }
1802
1803                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1804                 if (r < 0) {
1805                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1806                         return r;
1807                 }
1808
1809                 n = strappend("mv-", *i);
1810                 if (!n)
1811                         return log_oom();
1812
1813                 strshorten(n, IFNAMSIZ-1);
1814
1815                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1816                 if (r < 0) {
1817                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1818                         return r;
1819                 }
1820
1821                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1822                 if (r < 0) {
1823                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1824                         return r;
1825                 }
1826
1827                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1828                 if (r < 0) {
1829                         log_error("Failed to open netlink container: %s", strerror(-r));
1830                         return r;
1831                 }
1832
1833                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1834                 if (r < 0) {
1835                         log_error("Failed to open netlink container: %s", strerror(-r));
1836                         return r;
1837                 }
1838
1839                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1840                 if (r < 0) {
1841                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1842                         return r;
1843                 }
1844
1845                 r = sd_rtnl_message_close_container(m);
1846                 if (r < 0) {
1847                         log_error("Failed to close netlink container: %s", strerror(-r));
1848                         return r;
1849                 }
1850
1851                 r = sd_rtnl_message_close_container(m);
1852                 if (r < 0) {
1853                         log_error("Failed to close netlink container: %s", strerror(-r));
1854                         return r;
1855                 }
1856
1857                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1858                 if (r < 0) {
1859                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1860                         return r;
1861                 }
1862         }
1863
1864         return 0;
1865 }
1866
/* Installs a default-allow seccomp filter for the calling process that
 *
 *   - makes a fixed list of system calls fail with EPERM, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns 0 on success (always 0 when built without HAVE_SECCOMP), a
 * negative errno-style value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int k;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        k = seccomp_add_secondary_archs(ctx);
        if (k < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-k));
                goto release;
        }

        for (n = 0; n < ELEMENTSOF(blocked_syscalls); n++) {
                k = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[n], 0);

                /* -EFAULT is treated as "unknown syscall" and skipped */
                if (k >= 0 || k == -EFAULT)
                        continue;

                log_error("Failed to block syscall: %s", strerror(-k));
                goto release;
        }

        /* Audit does not work inside containers and much of the audit
         * userspace breaks there, so creation of audit sockets is
         * switched off altogether: socket(AF_NETLINK, *, NETLINK_AUDIT)
         * fails with EAFNOSUPPORT, which audit userspace takes to mean
         * that the kernel has audit disabled. */
        k = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (k < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-k));
                goto release;
        }

        /* Loading the filter must not switch on NO_NEW_PRIVS */
        k = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (k < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-k));
                goto release;
        }

        k = seccomp_load(ctx);
        if (k < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-k));

release:
        seccomp_release(ctx);
        return k;
#else
        return 0;
#endif

}
1946
1947 static int setup_image(char **device_path, int *loop_nr) {
1948         struct loop_info64 info = {
1949                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1950         };
1951         _cleanup_close_ int fd = -1, control = -1, loop = -1;
1952         _cleanup_free_ char* loopdev = NULL;
1953         struct stat st;
1954         int r, nr;
1955
1956         assert(device_path);
1957         assert(loop_nr);
1958
1959         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1960         if (fd < 0) {
1961                 log_error("Failed to open %s: %m", arg_image);
1962                 return -errno;
1963         }
1964
1965         if (fstat(fd, &st) < 0) {
1966                 log_error("Failed to stat %s: %m", arg_image);
1967                 return -errno;
1968         }
1969
1970         if (S_ISBLK(st.st_mode)) {
1971                 char *p;
1972
1973                 p = strdup(arg_image);
1974                 if (!p)
1975                         return log_oom();
1976
1977                 *device_path = p;
1978
1979                 *loop_nr = -1;
1980
1981                 r = fd;
1982                 fd = -1;
1983
1984                 return r;
1985         }
1986
1987         if (!S_ISREG(st.st_mode)) {
1988                 log_error("%s is not a regular file or block device: %m", arg_image);
1989                 return -EINVAL;
1990         }
1991
1992         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1993         if (control < 0) {
1994                 log_error("Failed to open /dev/loop-control: %m");
1995                 return -errno;
1996         }
1997
1998         nr = ioctl(control, LOOP_CTL_GET_FREE);
1999         if (nr < 0) {
2000                 log_error("Failed to allocate loop device: %m");
2001                 return -errno;
2002         }
2003
2004         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2005                 return log_oom();
2006
2007         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2008         if (loop < 0) {
2009                 log_error("Failed to open loop device %s: %m", loopdev);
2010                 return -errno;
2011         }
2012
2013         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2014                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2015                 return -errno;
2016         }
2017
2018         if (arg_read_only)
2019                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2020
2021         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2022                 log_error("Failed to set loopback settings on %s: %m", loopdev);
2023                 return -errno;
2024         }
2025
2026         *device_path = loopdev;
2027         loopdev = NULL;
2028
2029         *loop_nr = nr;
2030
2031         r = loop;
2032         loop = -1;
2033
2034         return r;
2035 }
2036
#ifdef HAVE_BLKID
/* Helper for dissect_image(): remembers node as the candidate for one
 * partition role, unless a candidate with a lower or equal partition
 * number is already held.
 *
 * Returns 1 if the candidate was replaced, 0 if the existing one was
 * kept, and the result of log_oom() if copying the node path failed. */
static int dissect_image_remember(
                char **candidate, int *candidate_nr, bool *candidate_rw,
                const char *node, int nr, bool rw) {

        if (*candidate && nr >= *candidate_nr)
                return 0;

        *candidate_nr = nr;
        *candidate_rw = rw;

        free(*candidate);
        *candidate = strdup(node);
        if (!*candidate)
                return log_oom();

        return 1;
}
#endif

/* Looks for the root, /home and /srv partitions in the GPT partition
 * table of the block device open as fd, identified by their partition
 * type UUIDs. Partitions flagged GPT_FLAG_NO_AUTO are ignored; if a type
 * occurs more than once, the one with the lowest partition number wins.
 *
 * On success the device node paths are stored as newly allocated strings
 * in *root_device, *home_device and *srv_device (the latter two only if
 * such a partition exists), and the matching *_rw booleans tell whether
 * the partition lacks GPT_FLAG_READ_ONLY. *secondary is set to true if
 * the root partition is of type GPT_ROOT_SECONDARY rather than
 * GPT_ROOT_NATIVE.
 *
 * Returns 0 on success, a negative errno-style value on failure;
 * -ENOTSUP when built without HAVE_BLKID. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerate = NULL;
        _cleanup_udev_device_unref_ struct udev_device *disk = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *head, *entry;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist partitions;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        /* Step 1: probe the partition table with libblkid */

        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(probe, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(probe, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partitions = blkid_probe_get_partitions(probe);
        if (!partitions) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        /* Step 2: enumerate the child devices of the disk through udev */

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        disk = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!disk)
                return log_oom();

        enumerate = udev_enumerate_new(udev);
        if (!enumerate)
                return log_oom();

        r = udev_enumerate_add_match_parent(enumerate, disk);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(enumerate);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        /* Step 3: match every partition device against the blkid
         * partition list and sort it into its role */

        head = udev_enumerate_get_list_entry(enumerate);
        udev_list_entry_foreach(entry, head) {
                _cleanup_udev_device_unref_ struct udev_device *part_dev;
                const char *type_string, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition part;
                dev_t part_devnum;
                bool part_rw;
                int nr, k;

                errno = 0;
                part_dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(entry));
                if (!part_dev) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                part_devnum = udev_device_get_devnum(part_dev);
                if (major(part_devnum) == 0)
                        continue;

                /* The enumeration also yields the whole disk itself */
                if (st.st_rdev == part_devnum)
                        continue;

                node = udev_device_get_devnode(part_dev);
                if (!node)
                        continue;

                part = blkid_partlist_devno_to_partition(partitions, part_devnum);
                if (!part)
                        continue;

                flags = blkid_partition_get_flags(part);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(part);
                if (nr < 0)
                        continue;

                type_string = blkid_partition_get_type_string(part);
                if (!type_string)
                        continue;

                if (sd_id128_from_string(type_string, &type_id) < 0)
                        continue;

                part_rw = !(flags & GPT_FLAG_READ_ONLY);

                if (sd_id128_equal(type_id, GPT_HOME))
                        k = dissect_image_remember(&home, &home_nr, &home_rw, node, nr, part_rw);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        k = dissect_image_remember(&srv, &srv_nr, &srv_rw, node, nr, part_rw);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        k = dissect_image_remember(&root, &root_nr, &root_rw, node, nr, part_rw);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        k = dissect_image_remember(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, part_rw);
#endif
                else
                        k = 0;

                if (k < 0)
                        return k;
        }

        /* Step 4: hand the results over to the caller */

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else {
                /* secondary_root is known to be set at this point */
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2286
/* Determines the file system type of the block device `what` with
 * libblkid and mounts it with MS_NODEV on `where`, or on `where`
 * followed by `directory` if `directory` is not NULL.
 *
 * The mount is read-only if `rw` is false or arg_read_only is set.
 * LUKS volumes are refused with -ENOTSUP.
 *
 * Returns 0 on success, a negative errno-style value on failure;
 * -ENOTSUP when built without HAVE_BLKID. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() yields 1 when nothing was detected
                 * and -2 when the result is ambivalent; -1 is a genuine
                 * error and is handled in the branch below so that errno
                 * gets reported. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2352
/* Mounts the partitions found in a disk image below `where`: the root
 * device on `where` itself, the home device on `where`/home and the srv
 * device on `where`/srv. Any of the three device paths may be NULL, in
 * which case that mount is skipped. The *_rw flags are passed on to
 * mount_device() for the respective partition.
 *
 * Returns 0 on success, or the negative errno-style value of the first
 * mount that failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the `where` argument rather than reaching
         * for the global arg_directory, so that the parameter is
         * actually honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2388
2389 static void loop_remove(int nr, int *image_fd) {
2390         _cleanup_close_ int control = -1;
2391
2392         if (nr < 0)
2393                 return;
2394
2395         if (image_fd && *image_fd >= 0) {
2396                 ioctl(*image_fd, LOOP_CLR_FD);
2397                 *image_fd = safe_close(*image_fd);
2398         }
2399
2400         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2401         if (control < 0)
2402                 return;
2403
2404         ioctl(control, LOOP_CTL_REMOVE, nr);
2405 }
2406
2407 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2408         int pipe_fds[2];
2409         pid_t pid;
2410
2411         assert(database);
2412         assert(key);
2413         assert(rpid);
2414
2415         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2416                 log_error("Failed to allocate pipe: %m");
2417                 return -errno;
2418         }
2419
2420         pid = fork();
2421         if (pid < 0) {
2422                 log_error("Failed to fork getent child: %m");
2423                 return -errno;
2424         } else if (pid == 0) {
2425                 int nullfd;
2426                 char *empty_env = NULL;
2427
2428                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2429                         _exit(EXIT_FAILURE);
2430
2431                 if (pipe_fds[0] > 2)
2432                         safe_close(pipe_fds[0]);
2433                 if (pipe_fds[1] > 2)
2434                         safe_close(pipe_fds[1]);
2435
2436                 nullfd = open("/dev/null", O_RDWR);
2437                 if (nullfd < 0)
2438                         _exit(EXIT_FAILURE);
2439
2440                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2441                         _exit(EXIT_FAILURE);
2442
2443                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2444                         _exit(EXIT_FAILURE);
2445
2446                 if (nullfd > 2)
2447                         safe_close(nullfd);
2448
2449                 reset_all_signal_handlers();
2450                 close_all_fds(NULL, 0);
2451
2452                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2453                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2454                 _exit(EXIT_FAILURE);
2455         }
2456
2457         pipe_fds[1] = safe_close(pipe_fds[1]);
2458
2459         *rpid = pid;
2460
2461         return pipe_fds[0];
2462 }
2463
2464 static int change_uid_gid(char **_home) {
2465         char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2466         _cleanup_free_ uid_t *uids = NULL;
2467         _cleanup_free_ char *home = NULL;
2468         _cleanup_fclose_ FILE *f = NULL;
2469         _cleanup_close_ int fd = -1;
2470         unsigned n_uids = 0;
2471         size_t sz = 0, l;
2472         uid_t uid;
2473         gid_t gid;
2474         pid_t pid;
2475         int r;
2476
2477         assert(_home);
2478
2479         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2480                 /* Reset everything fully to 0, just in case */
2481
2482                 if (setgroups(0, NULL) < 0) {
2483                         log_error("setgroups() failed: %m");
2484                         return -errno;
2485                 }
2486
2487                 if (setresgid(0, 0, 0) < 0) {
2488                         log_error("setregid() failed: %m");
2489                         return -errno;
2490                 }
2491
2492                 if (setresuid(0, 0, 0) < 0) {
2493                         log_error("setreuid() failed: %m");
2494                         return -errno;
2495                 }
2496
2497                 *_home = NULL;
2498                 return 0;
2499         }
2500
2501         /* First, get user credentials */
2502         fd = spawn_getent("passwd", arg_user, &pid);
2503         if (fd < 0)
2504                 return fd;
2505
2506         f = fdopen(fd, "r");
2507         if (!f)
2508                 return log_oom();
2509         fd = -1;
2510
2511         if (!fgets(line, sizeof(line), f)) {
2512
2513                 if (!ferror(f)) {
2514                         log_error("Failed to resolve user %s.", arg_user);
2515                         return -ESRCH;
2516                 }
2517
2518                 log_error("Failed to read from getent: %m");
2519                 return -errno;
2520         }
2521
2522         truncate_nl(line);
2523
2524         wait_for_terminate_and_warn("getent passwd", pid);
2525
2526         x = strchr(line, ':');
2527         if (!x) {
2528                 log_error("/etc/passwd entry has invalid user field.");
2529                 return -EIO;
2530         }
2531
2532         u = strchr(x+1, ':');
2533         if (!u) {
2534                 log_error("/etc/passwd entry has invalid password field.");
2535                 return -EIO;
2536         }
2537
2538         u++;
2539         g = strchr(u, ':');
2540         if (!g) {
2541                 log_error("/etc/passwd entry has invalid UID field.");
2542                 return -EIO;
2543         }
2544
2545         *g = 0;
2546         g++;
2547         x = strchr(g, ':');
2548         if (!x) {
2549                 log_error("/etc/passwd entry has invalid GID field.");
2550                 return -EIO;
2551         }
2552
2553         *x = 0;
2554         h = strchr(x+1, ':');
2555         if (!h) {
2556                 log_error("/etc/passwd entry has invalid GECOS field.");
2557                 return -EIO;
2558         }
2559
2560         h++;
2561         x = strchr(h, ':');
2562         if (!x) {
2563                 log_error("/etc/passwd entry has invalid home directory field.");
2564                 return -EIO;
2565         }
2566
2567         *x = 0;
2568
2569         r = parse_uid(u, &uid);
2570         if (r < 0) {
2571                 log_error("Failed to parse UID of user.");
2572                 return -EIO;
2573         }
2574
2575         r = parse_gid(g, &gid);
2576         if (r < 0) {
2577                 log_error("Failed to parse GID of user.");
2578                 return -EIO;
2579         }
2580
2581         home = strdup(h);
2582         if (!home)
2583                 return log_oom();
2584
2585         /* Second, get group memberships */
2586         fd = spawn_getent("initgroups", arg_user, &pid);
2587         if (fd < 0)
2588                 return fd;
2589
2590         fclose(f);
2591         f = fdopen(fd, "r");
2592         if (!f)
2593                 return log_oom();
2594         fd = -1;
2595
2596         if (!fgets(line, sizeof(line), f)) {
2597                 if (!ferror(f)) {
2598                         log_error("Failed to resolve user %s.", arg_user);
2599                         return -ESRCH;
2600                 }
2601
2602                 log_error("Failed to read from getent: %m");
2603                 return -errno;
2604         }
2605
2606         truncate_nl(line);
2607
2608         wait_for_terminate_and_warn("getent initgroups", pid);
2609
2610         /* Skip over the username and subsequent separator whitespace */
2611         x = line;
2612         x += strcspn(x, WHITESPACE);
2613         x += strspn(x, WHITESPACE);
2614
2615         FOREACH_WORD(w, l, x, state) {
2616                 char c[l+1];
2617
2618                 memcpy(c, w, l);
2619                 c[l] = 0;
2620
2621                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2622                         return log_oom();
2623
2624                 r = parse_uid(c, &uids[n_uids++]);
2625                 if (r < 0) {
2626                         log_error("Failed to parse group data from getent.");
2627                         return -EIO;
2628                 }
2629         }
2630
2631         r = mkdir_parents(home, 0775);
2632         if (r < 0) {
2633                 log_error("Failed to make home root directory: %s", strerror(-r));
2634                 return r;
2635         }
2636
2637         r = mkdir_safe(home, 0755, uid, gid);
2638         if (r < 0 && r != -EEXIST) {
2639                 log_error("Failed to make home directory: %s", strerror(-r));
2640                 return r;
2641         }
2642
2643         fchown(STDIN_FILENO, uid, gid);
2644         fchown(STDOUT_FILENO, uid, gid);
2645         fchown(STDERR_FILENO, uid, gid);
2646
2647         if (setgroups(n_uids, uids) < 0) {
2648                 log_error("Failed to set auxiliary groups: %m");
2649                 return -errno;
2650         }
2651
2652         if (setresgid(gid, gid, gid) < 0) {
2653                 log_error("setregid() failed: %m");
2654                 return -errno;
2655         }
2656
2657         if (setresuid(uid, uid, uid) < 0) {
2658                 log_error("setreuid() failed: %m");
2659                 return -errno;
2660         }
2661
2662         if (_home) {
2663                 *_home = home;
2664                 home = NULL;
2665         }
2666
2667         return 0;
2668 }
2669
2670 /*
2671  * Return values:
2672  * < 0 : wait_for_terminate() failed to get the state of the
2673  *       container, the container was terminated by a signal, or
2674  *       failed for an unknown reason.  No change is made to the
2675  *       container argument.
2676  * > 0 : The program executed in the container terminated with an
2677  *       error.  The exit code of the program executed in the
2678  *       container is returned.  No change is made to the container
2679  *       argument.
2680  *   0 : The container is being rebooted, has been shut down or exited
2681  *       successfully.  The container argument has been set to either
2682  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2683  *
2684  * That is, success is indicated by a return value of zero, and an
2685  * error is indicated by a non-zero value.
2686  */
2687 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2688         int r;
2689         siginfo_t status;
2690
2691         r = wait_for_terminate(pid, &status);
2692         if (r < 0) {
2693                 log_warning("Failed to wait for container: %s", strerror(-r));
2694                 return r;
2695         }
2696
2697         switch (status.si_code) {
2698         case CLD_EXITED:
2699                 r = status.si_status;
2700                 if (r == 0) {
2701                         if (!arg_quiet)
2702                                 log_debug("Container %s exited successfully.",
2703                                           arg_machine);
2704
2705                         *container = CONTAINER_TERMINATED;
2706                 } else {
2707                         log_error("Container %s failed with error code %i.",
2708                                   arg_machine, status.si_status);
2709                 }
2710                 break;
2711
2712         case CLD_KILLED:
2713                 if (status.si_status == SIGINT) {
2714                         if (!arg_quiet)
2715                                 log_info("Container %s has been shut down.",
2716                                          arg_machine);
2717
2718                         *container = CONTAINER_TERMINATED;
2719                         r = 0;
2720                         break;
2721                 } else if (status.si_status == SIGHUP) {
2722                         if (!arg_quiet)
2723                                 log_info("Container %s is being rebooted.",
2724                                          arg_machine);
2725
2726                         *container = CONTAINER_REBOOTED;
2727                         r = 0;
2728                         break;
2729                 }
2730                 /* CLD_KILLED fallthrough */
2731
2732         case CLD_DUMPED:
2733                 log_error("Container %s terminated by signal %s.",
2734                           arg_machine, signal_to_string(status.si_status));
2735                 r = -1;
2736                 break;
2737
2738         default:
2739                 log_error("Container %s failed due to unknown reason.",
2740                           arg_machine);
2741                 r = -1;
2742                 break;
2743         }
2744
2745         return r;
2746 }
2747
/* Signal handler that intentionally does nothing. main() installs it
 * for SIGCHLD so that the signal interrupts blocking calls instead of
 * being discarded. */
static void nop_handler(int sig) {
        (void) sig;
}
2749
2750 int main(int argc, char *argv[]) {
2751
2752         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2753         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2754         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2755         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2756         _cleanup_fdset_free_ FDSet *fds = NULL;
2757         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2758         const char *console = NULL;
2759         char veth_name[IFNAMSIZ];
2760         bool secondary = false;
2761         sigset_t mask, mask_chld;
2762         pid_t pid = 0;
2763
2764         log_parse_environment();
2765         log_open();
2766
2767         k = parse_argv(argc, argv);
2768         if (k < 0)
2769                 goto finish;
2770         else if (k == 0) {
2771                 r = EXIT_SUCCESS;
2772                 goto finish;
2773         }
2774
2775         if (!arg_image) {
2776                 if (arg_directory) {
2777                         char *p;
2778
2779                         p = path_make_absolute_cwd(arg_directory);
2780                         free(arg_directory);
2781                         arg_directory = p;
2782                 } else
2783                         arg_directory = get_current_dir_name();
2784
2785                 if (!arg_directory) {
2786                         log_error("Failed to determine path, please use -D.");
2787                         goto finish;
2788                 }
2789                 path_kill_slashes(arg_directory);
2790         }
2791
2792         if (!arg_machine) {
2793                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2794                 if (!arg_machine) {
2795                         log_oom();
2796                         goto finish;
2797                 }
2798
2799                 hostname_cleanup(arg_machine, false);
2800                 if (isempty(arg_machine)) {
2801                         log_error("Failed to determine machine name automatically, please use -M.");
2802                         goto finish;
2803                 }
2804         }
2805
2806         if (geteuid() != 0) {
2807                 log_error("Need to be root.");
2808                 goto finish;
2809         }
2810
2811         if (sd_booted() <= 0) {
2812                 log_error("Not running on a systemd system.");
2813                 goto finish;
2814         }
2815
2816         log_close();
2817         n_fd_passed = sd_listen_fds(false);
2818         if (n_fd_passed > 0) {
2819                 k = fdset_new_listen_fds(&fds, false);
2820                 if (k < 0) {
2821                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2822                         goto finish;
2823                 }
2824         }
2825         fdset_close_others(fds);
2826         log_open();
2827
2828         if (arg_directory) {
2829                 if (path_equal(arg_directory, "/")) {
2830                         log_error("Spawning container on root directory not supported.");
2831                         goto finish;
2832                 }
2833
2834                 if (arg_boot) {
2835                         if (path_is_os_tree(arg_directory) <= 0) {
2836                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2837                                 goto finish;
2838                         }
2839                 } else {
2840                         const char *p;
2841
2842                         p = strappenda(arg_directory,
2843                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2844                         if (access(p, F_OK) < 0) {
2845                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2846                                 goto finish;
2847
2848                         }
2849                 }
2850         } else {
2851                 char template[] = "/tmp/nspawn-root-XXXXXX";
2852
2853                 if (!mkdtemp(template)) {
2854                         log_error("Failed to create temporary directory: %m");
2855                         r = -errno;
2856                         goto finish;
2857                 }
2858
2859                 arg_directory = strdup(template);
2860                 if (!arg_directory) {
2861                         r = log_oom();
2862                         goto finish;
2863                 }
2864
2865                 image_fd = setup_image(&device_path, &loop_nr);
2866                 if (image_fd < 0) {
2867                         r = image_fd;
2868                         goto finish;
2869                 }
2870
2871                 r = dissect_image(image_fd, &root_device, &root_device_rw, &home_device, &home_device_rw, &srv_device, &srv_device_rw, &secondary);
2872                 if (r < 0)
2873                         goto finish;
2874         }
2875
2876         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2877         if (master < 0) {
2878                 log_error("Failed to acquire pseudo tty: %m");
2879                 goto finish;
2880         }
2881
2882         console = ptsname(master);
2883         if (!console) {
2884                 log_error("Failed to determine tty name: %m");
2885                 goto finish;
2886         }
2887
2888         if (!arg_quiet)
2889                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2890                          arg_machine, arg_image ? arg_image : arg_directory);
2891
2892         if (unlockpt(master) < 0) {
2893                 log_error("Failed to unlock tty: %m");
2894                 goto finish;
2895         }
2896
2897         if (access("/dev/kdbus/control", F_OK) >= 0) {
2898
2899                 if (arg_share_system) {
2900                         kdbus_domain = strdup("/dev/kdbus");
2901                         if (!kdbus_domain) {
2902                                 log_oom();
2903                                 goto finish;
2904                         }
2905                 } else {
2906                         const char *ns;
2907
2908                         ns = strappenda("machine-", arg_machine);
2909                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2910                         if (r < 0)
2911                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2912                         else
2913                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2914                 }
2915         }
2916
2917         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2918                 log_error("Failed to create kmsg socket pair: %m");
2919                 goto finish;
2920         }
2921
2922         sd_notify(0, "READY=1");
2923
2924         assert_se(sigemptyset(&mask) == 0);
2925         assert_se(sigemptyset(&mask_chld) == 0);
2926         sigaddset(&mask_chld, SIGCHLD);
2927         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2928         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2929
2930         for (;;) {
2931                 ContainerStatus container_status;
2932                 int eventfds[2] = { -1, -1 };
2933                 struct sigaction sa = {
2934                         .sa_handler = nop_handler,
2935                         .sa_flags = SA_NOCLDSTOP,
2936                 };
2937
2938                 /* Child can be killed before execv(), so handle SIGCHLD
2939                  * in order to interrupt parent's blocking calls and
2940                  * give it a chance to call wait() and terminate. */
2941                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2942                 if (r < 0) {
2943                         log_error("Failed to change the signal mask: %m");
2944                         goto finish;
2945                 }
2946
2947                 r = sigaction(SIGCHLD, &sa, NULL);
2948                 if (r < 0) {
2949                         log_error("Failed to install SIGCHLD handler: %m");
2950                         goto finish;
2951                 }
2952
2953                 pid = clone_with_eventfd(SIGCHLD|CLONE_NEWNS|
2954                                          (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2955                                          (arg_private_network ? CLONE_NEWNET : 0), eventfds);
2956                 if (pid < 0) {
2957                         if (errno == EINVAL)
2958                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2959                         else
2960                                 log_error("clone() failed: %m");
2961
2962                         r = pid;
2963                         goto finish;
2964                 }
2965
2966                 if (pid == 0) {
2967                         /* child */
2968                         _cleanup_free_ char *home = NULL;
2969                         unsigned n_env = 2;
2970                         const char *envp[] = {
2971                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
2972                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2973                                 NULL, /* TERM */
2974                                 NULL, /* HOME */
2975                                 NULL, /* USER */
2976                                 NULL, /* LOGNAME */
2977                                 NULL, /* container_uuid */
2978                                 NULL, /* LISTEN_FDS */
2979                                 NULL, /* LISTEN_PID */
2980                                 NULL
2981                         };
2982                         char **env_use;
2983
2984                         envp[n_env] = strv_find_prefix(environ, "TERM=");
2985                         if (envp[n_env])
2986                                 n_env ++;
2987
2988                         master = safe_close(master);
2989
2990                         close_nointr(STDIN_FILENO);
2991                         close_nointr(STDOUT_FILENO);
2992                         close_nointr(STDERR_FILENO);
2993
2994                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
2995
2996                         reset_all_signal_handlers();
2997
2998                         assert_se(sigemptyset(&mask) == 0);
2999                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
3000
3001                         k = open_terminal(console, O_RDWR);
3002                         if (k != STDIN_FILENO) {
3003                                 if (k >= 0) {
3004                                         safe_close(k);
3005                                         k = -EINVAL;
3006                                 }
3007
3008                                 log_error("Failed to open console: %s", strerror(-k));
3009                                 goto child_fail;
3010                         }
3011
3012                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3013                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3014                                 log_error("Failed to duplicate console: %m");
3015                                 goto child_fail;
3016                         }
3017
3018                         if (setsid() < 0) {
3019                                 log_error("setsid() failed: %m");
3020                                 goto child_fail;
3021                         }
3022
3023                         if (reset_audit_loginuid() < 0)
3024                                 goto child_fail;
3025
3026                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3027                                 log_error("PR_SET_PDEATHSIG failed: %m");
3028                                 goto child_fail;
3029                         }
3030
3031                         /* Mark everything as slave, so that we still
3032                          * receive mounts from the real root, but don't
3033                          * propagate mounts to the real root. */
3034                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3035                                 log_error("MS_SLAVE|MS_REC failed: %m");
3036                                 goto child_fail;
3037                         }
3038
3039                         if (mount_devices(arg_directory,
3040                                           root_device, root_device_rw,
3041                                           home_device, home_device_rw,
3042                                           srv_device, srv_device_rw) < 0)
3043                                 goto child_fail;
3044
3045                         r = base_filesystem_create(arg_directory);
3046                         if (r < 0) {
3047                                 log_error("Failed to create the base filesystem: %s", strerror(-r));
3048                                 goto child_fail;
3049                         }
3050
3051                         /* Turn directory into bind mount */
3052                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3053                                 log_error("Failed to make bind mount: %m");
3054                                 goto child_fail;
3055                         }
3056
3057                         if (arg_read_only) {
3058                                 k = bind_remount_recursive(arg_directory, true);
3059                                 if (k < 0) {
3060                                         log_error("Failed to make tree read-only: %s", strerror(-k));
3061                                         goto child_fail;
3062                                 }
3063                         }
3064
3065                         if (mount_all(arg_directory) < 0)
3066                                 goto child_fail;
3067
3068                         if (copy_devnodes(arg_directory) < 0)
3069                                 goto child_fail;
3070
3071                         if (setup_ptmx(arg_directory) < 0)
3072                                 goto child_fail;
3073
3074                         dev_setup(arg_directory);
3075
3076                         if (setup_seccomp() < 0)
3077                                 goto child_fail;
3078
3079                         if (setup_dev_console(arg_directory, console) < 0)
3080                                 goto child_fail;
3081
3082                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3083                                 goto child_fail;
3084
3085                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3086
3087                         if (setup_boot_id(arg_directory) < 0)
3088                                 goto child_fail;
3089
3090                         if (setup_timezone(arg_directory) < 0)
3091                                 goto child_fail;
3092
3093                         if (setup_resolv_conf(arg_directory) < 0)
3094                                 goto child_fail;
3095
3096                         if (setup_journal(arg_directory) < 0)
3097                                 goto child_fail;
3098
3099                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3100                                 goto child_fail;
3101
3102                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3103                                 goto child_fail;
3104
3105                         if (mount_tmpfs(arg_directory) < 0)
3106                                 goto child_fail;
3107
3108                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3109                                 goto child_fail;
3110
3111                         /* Tell the parent that we are ready, and that
3112                          * it can cgroupify us so that we lack access
3113                          * to certain devices and resources. */
3114                         r = eventfd_send_state(eventfds[1],
3115                                                EVENTFD_CHILD_SUCCEEDED);
3116                         eventfds[1] = safe_close(eventfds[1]);
3117                         if (r < 0)
3118                                 goto child_fail;
3119
3120                         if (chdir(arg_directory) < 0) {
3121                                 log_error("chdir(%s) failed: %m", arg_directory);
3122                                 goto child_fail;
3123                         }
3124
3125                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3126                                 log_error("mount(MS_MOVE) failed: %m");
3127                                 goto child_fail;
3128                         }
3129
3130                         if (chroot(".") < 0) {
3131                                 log_error("chroot() failed: %m");
3132                                 goto child_fail;
3133                         }
3134
3135                         if (chdir("/") < 0) {
3136                                 log_error("chdir() failed: %m");
3137                                 goto child_fail;
3138                         }
3139
3140                         umask(0022);
3141
3142                         if (arg_private_network)
3143                                 loopback_setup();
3144
3145                         if (drop_capabilities() < 0) {
3146                                 log_error("drop_capabilities() failed: %m");
3147                                 goto child_fail;
3148                         }
3149
3150                         r = change_uid_gid(&home);
3151                         if (r < 0)
3152                                 goto child_fail;
3153
3154                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3155                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3156                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3157                                 log_oom();
3158                                 goto child_fail;
3159                         }
3160
3161                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3162                                 char as_uuid[37];
3163
3164                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3165                                         log_oom();
3166                                         goto child_fail;
3167                                 }
3168                         }
3169
3170                         if (fdset_size(fds) > 0) {
3171                                 k = fdset_cloexec(fds, false);
3172                                 if (k < 0) {
3173                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3174                                         goto child_fail;
3175                                 }
3176
3177                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3178                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3179                                         log_oom();
3180                                         goto child_fail;
3181                                 }
3182                         }
3183
3184                         setup_hostname();
3185
3186                         if (arg_personality != 0xffffffffLU) {
3187                                 if (personality(arg_personality) < 0) {
3188                                         log_error("personality() failed: %m");
3189                                         goto child_fail;
3190                                 }
3191                         } else if (secondary) {
3192                                 if (personality(PER_LINUX32) < 0) {
3193                                         log_error("personality() failed: %m");
3194                                         goto child_fail;
3195                                 }
3196                         }
3197
3198 #ifdef HAVE_SELINUX
3199                         if (arg_selinux_context)
3200                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3201                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3202                                         goto child_fail;
3203                                 }
3204 #endif
3205
3206                         if (!strv_isempty(arg_setenv)) {
3207                                 char **n;
3208
3209                                 n = strv_env_merge(2, envp, arg_setenv);
3210                                 if (!n) {
3211                                         log_oom();
3212                                         goto child_fail;
3213                                 }
3214
3215                                 env_use = n;
3216                         } else
3217                                 env_use = (char**) envp;
3218
3219                         /* Wait until the parent is ready with the setup, too... */
3220                         r = eventfd_parent_succeeded(eventfds[0]);
3221                         eventfds[0] = safe_close(eventfds[0]);
3222                         if (r < 0)
3223                                 goto child_fail;
3224
3225                         if (arg_boot) {
3226                                 char **a;
3227                                 size_t l;
3228
3229                                 /* Automatically search for the init system */
3230
3231                                 l = 1 + argc - optind;
3232                                 a = newa(char*, l + 1);
3233                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3234
3235                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3236                                 execve(a[0], a, env_use);
3237
3238                                 a[0] = (char*) "/lib/systemd/systemd";
3239                                 execve(a[0], a, env_use);
3240
3241                                 a[0] = (char*) "/sbin/init";
3242                                 execve(a[0], a, env_use);
3243                         } else if (argc > optind)
3244                                 execvpe(argv[optind], argv + optind, env_use);
3245                         else {
3246                                 chdir(home ? home : "/root");
3247                                 execle("/bin/bash", "-bash", NULL, env_use);
3248                                 execle("/bin/sh", "-sh", NULL, env_use);
3249                         }
3250
3251                         log_error("execv() failed: %m");
3252
3253                 child_fail:
3254                         /* Tell the parent that the setup failed, so he
3255                          * can clean up resources and terminate. */
3256                         if (eventfds[1] != -1)
3257                                 eventfd_send_state(eventfds[1],
3258                                                    EVENTFD_CHILD_FAILED);
3259                         _exit(EXIT_FAILURE);
3260                 }
3261
3262                 fdset_free(fds);
3263                 fds = NULL;
3264
3265                 /* Wait for the child event:
3266                  * If EVENTFD_CHILD_FAILED, the child will terminate soon.
3267                  * If EVENTFD_CHILD_SUCCEEDED, the child is reporting that
3268                  * it is ready with all it needs to do with privileges.
3269                  * After we got the notification we can make the process
3270                  * join its cgroup which might limit what it can do */
3271                 r = eventfd_child_succeeded(eventfds[1]);
3272                 eventfds[1] = safe_close(eventfds[1]);
3273
3274                 if (r >= 0) {
3275                         r = register_machine(pid);
3276                         if (r < 0)
3277                                 goto finish;
3278
3279                         r = move_network_interfaces(pid);
3280                         if (r < 0)
3281                                 goto finish;
3282
3283                         r = setup_veth(pid, veth_name);
3284                         if (r < 0)
3285                                 goto finish;
3286
3287                         r = setup_bridge(veth_name);
3288                         if (r < 0)
3289                                 goto finish;
3290
3291                         r = setup_macvlan(pid);
3292                         if (r < 0)
3293                                 goto finish;
3294
3295                         /* Block SIGCHLD here, before notifying child.
3296                          * process_pty() will handle it with the other signals. */
3297                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3298                         if (r < 0)
3299                                 goto finish;
3300
3301                         /* Reset signal to default */
3302                         r = default_signals(SIGCHLD, -1);
3303                         if (r < 0)
3304                                 goto finish;
3305
3306                         /* Notify the child that the parent is ready with all
3307                          * its setup, and that the child can now hand over
3308                          * control to the code to run inside the container. */
3309                         r = eventfd_send_state(eventfds[0], EVENTFD_PARENT_SUCCEEDED);
3310                         eventfds[0] = safe_close(eventfds[0]);
3311                         if (r < 0)
3312                                 goto finish;
3313
3314                         k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3315                         if (k < 0) {
3316                                 r = EXIT_FAILURE;
3317                                 break;
3318                         }
3319
3320                         if (!arg_quiet)
3321                                 putc('\n', stdout);
3322
3323                         /* Kill if it is not dead yet anyway */
3324                         terminate_machine(pid);
3325                 }
3326
3327                 /* Normally redundant, but better safe than sorry */
3328                 kill(pid, SIGKILL);
3329
3330                 r = wait_for_container(pid, &container_status);
3331                 pid = 0;
3332
3333                 if (r < 0) {
3334                         /* We failed to wait for the container, or the
3335                          * container exited abnormally */
3336                         r = EXIT_FAILURE;
3337                         break;
3338                 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3339                         /* The container exited with a non-zero
3340                          * status, or with zero status and no reboot
3341                          * was requested. */
3342                         break;
3343
3344                 /* CONTAINER_REBOOTED, loop again */
3345         }
3346
3347 finish:
3348         loop_remove(loop_nr, &image_fd);
3349
3350         if (pid > 0)
3351                 kill(pid, SIGKILL);
3352
3353         free(arg_directory);
3354         free(arg_machine);
3355         free(arg_user);
3356         strv_free(arg_setenv);
3357         strv_free(arg_network_interfaces);
3358         strv_free(arg_network_macvlan);
3359         strv_free(arg_bind);
3360         strv_free(arg_bind_ro);
3361         strv_free(arg_tmpfs);
3362
3363         return r;
3364 }