chiark / gitweb /
barrier: initialize file descriptors with -1
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92
93 #ifdef HAVE_SECCOMP
94 #include "seccomp-util.h"
95 #endif
96
/* Final states of a container (names suggest: terminated for good vs.
 * requested a reboot; the code using them is not part of this section). */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
101
/* Journal linking mode, as selected with --link-journal=MODE. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto; the compiled-in default */
        LINK_HOST  = 2,  /* --link-journal=host */
        LINK_GUEST = 3   /* --link-journal=guest, also what -j selects */
} LinkJournal;
108
/* Volatile mode, as selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO    = 0,  /* default, or --volatile=<false boolean> */
        VOLATILE_YES   = 1,  /* bare --volatile, or --volatile=<true boolean> */
        VOLATILE_STATE = 2,  /* --volatile=state */
} Volatile;
114
/* Settings filled in by parse_argv() from the command line. The initializers
 * below are the defaults that apply when the corresponding option is absent.
 *
 * Ownership differs per variable: arg_directory, arg_user and arg_machine are
 * heap copies (parse_argv() free()s the previous value before replacing it),
 * while the "const char *" settings simply point at getopt's optarg. */
static char *arg_directory = NULL;                      /* -D/--directory=, canonicalized */
static char *arg_user = NULL;                           /* -u/--user= */
static sd_id128_t arg_uuid = {};                        /* --uuid= */
static char *arg_machine = NULL;                        /* -M/--machine= */
static const char *arg_selinux_context = NULL;          /* -Z/--selinux-context= */
static const char *arg_selinux_apifs_context = NULL;    /* -L/--selinux-apifs-context= */
static const char *arg_slice = NULL;                    /* -S/--slice= */
static bool arg_private_network = false;                /* --private-network, also implied by the other --network-* options */
static bool arg_read_only = false;                      /* --read-only */
static bool arg_boot = false;                           /* -b/--boot */
static LinkJournal arg_link_journal = LINK_AUTO;        /* --link-journal= / -j */

/* Capability bit mask retained by default. parse_argv() ORs in the
 * capabilities named with --capability= (plus CAP_NET_ADMIN when a private
 * network was requested) and afterwards masks out those named with
 * --drop-capability=. */
static uint64_t arg_retain =
        (1ULL << CAP_CHOWN) |
        (1ULL << CAP_DAC_OVERRIDE) |
        (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) |
        (1ULL << CAP_FSETID) |
        (1ULL << CAP_IPC_OWNER) |
        (1ULL << CAP_KILL) |
        (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) |
        (1ULL << CAP_NET_BIND_SERVICE) |
        (1ULL << CAP_NET_BROADCAST) |
        (1ULL << CAP_NET_RAW) |
        (1ULL << CAP_SETGID) |
        (1ULL << CAP_SETFCAP) |
        (1ULL << CAP_SETPCAP) |
        (1ULL << CAP_SETUID) |
        (1ULL << CAP_SYS_ADMIN) |
        (1ULL << CAP_SYS_CHROOT) |
        (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) |
        (1ULL << CAP_SYS_TTY_CONFIG) |
        (1ULL << CAP_SYS_RESOURCE) |
        (1ULL << CAP_SYS_BOOT) |
        (1ULL << CAP_AUDIT_WRITE) |
        (1ULL << CAP_AUDIT_CONTROL) |
        (1ULL << CAP_MKNOD);

/* The following string lists are flat sequences of pairs: parse_argv()
 * always appends two entries per option, and the mount helpers walk them
 * with STRV_FOREACH_PAIR. */
static char **arg_bind = NULL;                          /* --bind=: (host path, container path) pairs */
static char **arg_bind_ro = NULL;                       /* --bind-ro=: same layout as arg_bind */
static char **arg_tmpfs = NULL;                         /* --tmpfs=: (path, mount options) pairs */
static char **arg_setenv = NULL;                        /* --setenv= assignments */
static bool arg_quiet = false;                          /* -q/--quiet */
static bool arg_share_system = false;                   /* --share-system */
static bool arg_register = true;                        /* --register=; forced to false by --share-system */
static bool arg_keep_unit = false;                      /* --keep-unit */
static char **arg_network_interfaces = NULL;            /* --network-interface= (may be repeated) */
static char **arg_network_macvlan = NULL;               /* --network-macvlan= (may be repeated) */
static bool arg_network_veth = false;                   /* --network-veth, also set by --network-bridge= */
static const char *arg_network_bridge = NULL;           /* --network-bridge= */
/* NOTE(review): 0xffffffffLU is also the value parse_argv() treats as a
 * failure result of personality_from_string(), so it presumably doubles as
 * "no personality requested" -- confirm against the code that applies it. */
static unsigned long arg_personality = 0xffffffffLU;    /* --personality= */
static const char *arg_image = NULL;                    /* -i/--image= */
static Volatile arg_volatile = VOLATILE_NO;             /* --volatile[=MODE] */
168
/* Print the command line usage summary to stdout.
 *
 * The program name shown in the first line is taken from
 * program_invocation_short_name. Always returns 0; parse_argv() passes that
 * value straight on to its own caller.
 *
 * Keep this text in sync with the option table in parse_argv(): -j selects
 * LINK_GUEST there, and --personality= is an accepted option. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "  -i --image=PATH           File system device or image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --personality=ARCH     Pick personality for this container\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n",
               program_invocation_short_name);

        return 0;
}
222
223 static int parse_argv(int argc, char *argv[]) {
224
225         enum {
226                 ARG_VERSION = 0x100,
227                 ARG_PRIVATE_NETWORK,
228                 ARG_UUID,
229                 ARG_READ_ONLY,
230                 ARG_CAPABILITY,
231                 ARG_DROP_CAPABILITY,
232                 ARG_LINK_JOURNAL,
233                 ARG_BIND,
234                 ARG_BIND_RO,
235                 ARG_TMPFS,
236                 ARG_SETENV,
237                 ARG_SHARE_SYSTEM,
238                 ARG_REGISTER,
239                 ARG_KEEP_UNIT,
240                 ARG_NETWORK_INTERFACE,
241                 ARG_NETWORK_MACVLAN,
242                 ARG_NETWORK_VETH,
243                 ARG_NETWORK_BRIDGE,
244                 ARG_PERSONALITY,
245                 ARG_VOLATILE,
246         };
247
248         static const struct option options[] = {
249                 { "help",                  no_argument,       NULL, 'h'                   },
250                 { "version",               no_argument,       NULL, ARG_VERSION           },
251                 { "directory",             required_argument, NULL, 'D'                   },
252                 { "user",                  required_argument, NULL, 'u'                   },
253                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
254                 { "boot",                  no_argument,       NULL, 'b'                   },
255                 { "uuid",                  required_argument, NULL, ARG_UUID              },
256                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
257                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
258                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
259                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
260                 { "bind",                  required_argument, NULL, ARG_BIND              },
261                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
262                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
263                 { "machine",               required_argument, NULL, 'M'                   },
264                 { "slice",                 required_argument, NULL, 'S'                   },
265                 { "setenv",                required_argument, NULL, ARG_SETENV            },
266                 { "selinux-context",       required_argument, NULL, 'Z'                   },
267                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
268                 { "quiet",                 no_argument,       NULL, 'q'                   },
269                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
270                 { "register",              required_argument, NULL, ARG_REGISTER          },
271                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
272                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
273                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
274                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
275                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
276                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
277                 { "image",                 required_argument, NULL, 'i'                   },
278                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
279                 {}
280         };
281
282         int c, r;
283         uint64_t plus = 0, minus = 0;
284
285         assert(argc >= 0);
286         assert(argv);
287
288         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
289
290                 switch (c) {
291
292                 case 'h':
293                         return help();
294
295                 case ARG_VERSION:
296                         puts(PACKAGE_STRING);
297                         puts(SYSTEMD_FEATURES);
298                         return 0;
299
300                 case 'D':
301                         free(arg_directory);
302                         arg_directory = canonicalize_file_name(optarg);
303                         if (!arg_directory) {
304                                 log_error("Invalid root directory: %m");
305                                 return -ENOMEM;
306                         }
307
308                         break;
309
310                 case 'i':
311                         arg_image = optarg;
312                         break;
313
314                 case 'u':
315                         free(arg_user);
316                         arg_user = strdup(optarg);
317                         if (!arg_user)
318                                 return log_oom();
319
320                         break;
321
322                 case ARG_NETWORK_BRIDGE:
323                         arg_network_bridge = optarg;
324
325                         /* fall through */
326
327                 case ARG_NETWORK_VETH:
328                         arg_network_veth = true;
329                         arg_private_network = true;
330                         break;
331
332                 case ARG_NETWORK_INTERFACE:
333                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
334                                 return log_oom();
335
336                         arg_private_network = true;
337                         break;
338
339                 case ARG_NETWORK_MACVLAN:
340                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
341                                 return log_oom();
342
343                         /* fall through */
344
345                 case ARG_PRIVATE_NETWORK:
346                         arg_private_network = true;
347                         break;
348
349                 case 'b':
350                         arg_boot = true;
351                         break;
352
353                 case ARG_UUID:
354                         r = sd_id128_from_string(optarg, &arg_uuid);
355                         if (r < 0) {
356                                 log_error("Invalid UUID: %s", optarg);
357                                 return r;
358                         }
359                         break;
360
361                 case 'S':
362                         arg_slice = optarg;
363                         break;
364
365                 case 'M':
366                         if (isempty(optarg)) {
367                                 free(arg_machine);
368                                 arg_machine = NULL;
369                         } else {
370
371                                 if (!hostname_is_valid(optarg)) {
372                                         log_error("Invalid machine name: %s", optarg);
373                                         return -EINVAL;
374                                 }
375
376                                 free(arg_machine);
377                                 arg_machine = strdup(optarg);
378                                 if (!arg_machine)
379                                         return log_oom();
380
381                                 break;
382                         }
383
384                 case 'Z':
385                         arg_selinux_context = optarg;
386                         break;
387
388                 case 'L':
389                         arg_selinux_apifs_context = optarg;
390                         break;
391
392                 case ARG_READ_ONLY:
393                         arg_read_only = true;
394                         break;
395
396                 case ARG_CAPABILITY:
397                 case ARG_DROP_CAPABILITY: {
398                         char *state, *word;
399                         size_t length;
400
401                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
402                                 _cleanup_free_ char *t;
403                                 cap_value_t cap;
404
405                                 t = strndup(word, length);
406                                 if (!t)
407                                         return log_oom();
408
409                                 if (streq(t, "all")) {
410                                         if (c == ARG_CAPABILITY)
411                                                 plus = (uint64_t) -1;
412                                         else
413                                                 minus = (uint64_t) -1;
414                                 } else {
415                                         if (cap_from_name(t, &cap) < 0) {
416                                                 log_error("Failed to parse capability %s.", t);
417                                                 return -EINVAL;
418                                         }
419
420                                         if (c == ARG_CAPABILITY)
421                                                 plus |= 1ULL << (uint64_t) cap;
422                                         else
423                                                 minus |= 1ULL << (uint64_t) cap;
424                                 }
425                         }
426
427                         break;
428                 }
429
430                 case 'j':
431                         arg_link_journal = LINK_GUEST;
432                         break;
433
434                 case ARG_LINK_JOURNAL:
435                         if (streq(optarg, "auto"))
436                                 arg_link_journal = LINK_AUTO;
437                         else if (streq(optarg, "no"))
438                                 arg_link_journal = LINK_NO;
439                         else if (streq(optarg, "guest"))
440                                 arg_link_journal = LINK_GUEST;
441                         else if (streq(optarg, "host"))
442                                 arg_link_journal = LINK_HOST;
443                         else {
444                                 log_error("Failed to parse link journal mode %s", optarg);
445                                 return -EINVAL;
446                         }
447
448                         break;
449
450                 case ARG_BIND:
451                 case ARG_BIND_RO: {
452                         _cleanup_free_ char *a = NULL, *b = NULL;
453                         char *e;
454                         char ***x;
455
456                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
457
458                         e = strchr(optarg, ':');
459                         if (e) {
460                                 a = strndup(optarg, e - optarg);
461                                 b = strdup(e + 1);
462                         } else {
463                                 a = strdup(optarg);
464                                 b = strdup(optarg);
465                         }
466
467                         if (!a || !b)
468                                 return log_oom();
469
470                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
471                                 log_error("Invalid bind mount specification: %s", optarg);
472                                 return -EINVAL;
473                         }
474
475                         r = strv_extend(x, a);
476                         if (r < 0)
477                                 return log_oom();
478
479                         r = strv_extend(x, b);
480                         if (r < 0)
481                                 return log_oom();
482
483                         break;
484                 }
485
486                 case ARG_TMPFS: {
487                         _cleanup_free_ char *a = NULL, *b = NULL;
488                         char *e;
489
490                         e = strchr(optarg, ':');
491                         if (e) {
492                                 a = strndup(optarg, e - optarg);
493                                 b = strdup(e + 1);
494                         } else {
495                                 a = strdup(optarg);
496                                 b = strdup("mode=0755");
497                         }
498
499                         if (!a || !b)
500                                 return log_oom();
501
502                         if (!path_is_absolute(a)) {
503                                 log_error("Invalid tmpfs specification: %s", optarg);
504                                 return -EINVAL;
505                         }
506
507                         r = strv_push(&arg_tmpfs, a);
508                         if (r < 0)
509                                 return log_oom();
510
511                         a = NULL;
512
513                         r = strv_push(&arg_tmpfs, b);
514                         if (r < 0)
515                                 return log_oom();
516
517                         b = NULL;
518
519                         break;
520                 }
521
522                 case ARG_SETENV: {
523                         char **n;
524
525                         if (!env_assignment_is_valid(optarg)) {
526                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
527                                 return -EINVAL;
528                         }
529
530                         n = strv_env_set(arg_setenv, optarg);
531                         if (!n)
532                                 return log_oom();
533
534                         strv_free(arg_setenv);
535                         arg_setenv = n;
536                         break;
537                 }
538
539                 case 'q':
540                         arg_quiet = true;
541                         break;
542
543                 case ARG_SHARE_SYSTEM:
544                         arg_share_system = true;
545                         break;
546
547                 case ARG_REGISTER:
548                         r = parse_boolean(optarg);
549                         if (r < 0) {
550                                 log_error("Failed to parse --register= argument: %s", optarg);
551                                 return r;
552                         }
553
554                         arg_register = r;
555                         break;
556
557                 case ARG_KEEP_UNIT:
558                         arg_keep_unit = true;
559                         break;
560
561                 case ARG_PERSONALITY:
562
563                         arg_personality = personality_from_string(optarg);
564                         if (arg_personality == 0xffffffffLU) {
565                                 log_error("Unknown or unsupported personality '%s'.", optarg);
566                                 return -EINVAL;
567                         }
568
569                         break;
570
571                 case ARG_VOLATILE:
572
573                         if (!optarg)
574                                 arg_volatile = VOLATILE_YES;
575                         else {
576                                 r = parse_boolean(optarg);
577                                 if (r < 0) {
578                                         if (streq(optarg, "state"))
579                                                 arg_volatile = VOLATILE_STATE;
580                                         else {
581                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
582                                                 return r;
583                                         }
584                                 } else
585                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
586                         }
587
588                         break;
589
590                 case '?':
591                         return -EINVAL;
592
593                 default:
594                         assert_not_reached("Unhandled option");
595                 }
596         }
597
598         if (arg_share_system)
599                 arg_register = false;
600
601         if (arg_boot && arg_share_system) {
602                 log_error("--boot and --share-system may not be combined.");
603                 return -EINVAL;
604         }
605
606         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
607                 log_error("--keep-unit may not be used when invoked from a user session.");
608                 return -EINVAL;
609         }
610
611         if (arg_directory && arg_image) {
612                 log_error("--directory= and --image= may not be combined.");
613                 return -EINVAL;
614         }
615
616         if (arg_volatile != VOLATILE_NO && arg_read_only) {
617                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
618                 return -EINVAL;
619         }
620
621         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
622
623         return 1;
624 }
625
626 static int mount_all(const char *dest) {
627
628         typedef struct MountPoint {
629                 const char *what;
630                 const char *where;
631                 const char *type;
632                 const char *options;
633                 unsigned long flags;
634                 bool fatal;
635         } MountPoint;
636
637         static const MountPoint mount_table[] = {
638                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
639                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
640                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
641                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
642                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
643                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
644                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
645                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
646 #ifdef HAVE_SELINUX
647                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
648                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
649 #endif
650         };
651
652         unsigned k;
653         int r = 0;
654
655         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
656                 _cleanup_free_ char *where = NULL;
657 #ifdef HAVE_SELINUX
658                 _cleanup_free_ char *options = NULL;
659 #endif
660                 const char *o;
661                 int t;
662
663                 where = strjoin(dest, "/", mount_table[k].where, NULL);
664                 if (!where)
665                         return log_oom();
666
667                 t = path_is_mount_point(where, true);
668                 if (t < 0) {
669                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
670
671                         if (r == 0)
672                                 r = t;
673
674                         continue;
675                 }
676
677                 /* Skip this entry if it is not a remount. */
678                 if (mount_table[k].what && t > 0)
679                         continue;
680
681                 mkdir_p(where, 0755);
682
683 #ifdef HAVE_SELINUX
684                 if (arg_selinux_apifs_context &&
685                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
686                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
687                         if (!options)
688                                 return log_oom();
689
690                         o = options;
691                 } else
692 #endif
693                         o = mount_table[k].options;
694
695
696                 if (mount(mount_table[k].what,
697                           where,
698                           mount_table[k].type,
699                           mount_table[k].flags,
700                           o) < 0 &&
701                     mount_table[k].fatal) {
702
703                         log_error("mount(%s) failed: %m", where);
704
705                         if (r == 0)
706                                 r = -errno;
707                 }
708         }
709
710         return r;
711 }
712
713 static int mount_binds(const char *dest, char **l, bool ro) {
714         char **x, **y;
715
716         STRV_FOREACH_PAIR(x, y, l) {
717                 _cleanup_free_ char *where = NULL;
718                 struct stat source_st, dest_st;
719                 int r;
720
721                 if (stat(*x, &source_st) < 0) {
722                         log_error("Failed to stat %s: %m", *x);
723                         return -errno;
724                 }
725
726                 where = strappend(dest, *y);
727                 if (!where)
728                         return log_oom();
729
730                 r = stat(where, &dest_st);
731                 if (r == 0) {
732                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
733                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
734                                 return -EINVAL;
735                         }
736                 } else if (errno == ENOENT) {
737                         r = mkdir_parents_label(where, 0755);
738                         if (r < 0) {
739                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
740                                 return r;
741                         }
742                 } else {
743                         log_error("Failed to bind mount %s: %m", *x);
744                         return -errno;
745                 }
746
747                 /* Create the mount point, but be conservative -- refuse to create block
748                  * and char devices. */
749                 if (S_ISDIR(source_st.st_mode))
750                         mkdir_label(where, 0755);
751                 else if (S_ISFIFO(source_st.st_mode))
752                         mkfifo(where, 0644);
753                 else if (S_ISSOCK(source_st.st_mode))
754                         mknod(where, 0644 | S_IFSOCK, 0);
755                 else if (S_ISREG(source_st.st_mode))
756                         touch(where);
757                 else {
758                         log_error("Refusing to create mountpoint for file: %s", *x);
759                         return -ENOTSUP;
760                 }
761
762                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
763                         log_error("mount(%s) failed: %m", where);
764                         return -errno;
765                 }
766
767                 if (ro) {
768                         r = bind_remount_recursive(where, true);
769                         if (r < 0) {
770                                 log_error("Read-Only bind mount failed: %s", strerror(-r));
771                                 return r;
772                         }
773                 }
774         }
775
776         return 0;
777 }
778
779 static int mount_tmpfs(const char *dest) {
780         char **i, **o;
781
782         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
783                 _cleanup_free_ char *where = NULL;
784
785                 where = strappend(dest, *i);
786                 if (!where)
787                         return log_oom();
788
789                 mkdir_label(where, 0755);
790
791                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
792                         log_error("tmpfs mount to %s failed: %m", where);
793                         return -errno;
794                 }
795         }
796
797         return 0;
798 }
799
800 static int setup_timezone(const char *dest) {
801         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
802         char *z, *y;
803         int r;
804
805         assert(dest);
806
807         /* Fix the timezone, if possible */
808         r = readlink_malloc("/etc/localtime", &p);
809         if (r < 0) {
810                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
811                 return 0;
812         }
813
814         z = path_startswith(p, "../usr/share/zoneinfo/");
815         if (!z)
816                 z = path_startswith(p, "/usr/share/zoneinfo/");
817         if (!z) {
818                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
819                 return 0;
820         }
821
822         where = strappend(dest, "/etc/localtime");
823         if (!where)
824                 return log_oom();
825
826         r = readlink_malloc(where, &q);
827         if (r >= 0) {
828                 y = path_startswith(q, "../usr/share/zoneinfo/");
829                 if (!y)
830                         y = path_startswith(q, "/usr/share/zoneinfo/");
831
832                 /* Already pointing to the right place? Then do nothing .. */
833                 if (y && streq(y, z))
834                         return 0;
835         }
836
837         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
838         if (!check)
839                 return log_oom();
840
841         if (access(check, F_OK) < 0) {
842                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
843                 return 0;
844         }
845
846         what = strappend("../usr/share/zoneinfo/", z);
847         if (!what)
848                 return log_oom();
849
850         mkdir_parents(where, 0755);
851         unlink(where);
852
853         if (symlink(what, where) < 0) {
854                 log_error("Failed to correct timezone of container: %m");
855                 return 0;
856         }
857
858         return 0;
859 }
860
861 static int setup_resolv_conf(const char *dest) {
862         _cleanup_free_ char *where = NULL;
863
864         assert(dest);
865
866         if (arg_private_network)
867                 return 0;
868
869         /* Fix resolv.conf, if possible */
870         where = strappend(dest, "/etc/resolv.conf");
871         if (!where)
872                 return log_oom();
873
874         /* We don't really care for the results of this really. If it
875          * fails, it fails, but meh... */
876         mkdir_parents(where, 0755);
877         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
878
879         return 0;
880 }
881
882 static int setup_volatile_state(const char *directory) {
883         const char *p;
884         int r;
885
886         assert(directory);
887
888         if (arg_volatile != VOLATILE_STATE)
889                 return 0;
890
891         /* --volatile=state means we simply overmount /var
892            with a tmpfs, and the rest read-only. */
893
894         r = bind_remount_recursive(directory, true);
895         if (r < 0) {
896                 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
897                 return r;
898         }
899
900         p = strappenda(directory, "/var");
901         mkdir(p, 0755);
902
903         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
904                 log_error("Failed to mount tmpfs to /var: %m");
905                 return -errno;
906         }
907
908         return 0;
909 }
910
911 static int setup_volatile(const char *directory) {
912         bool tmpfs_mounted = false, bind_mounted = false;
913         char template[] = "/tmp/nspawn-volatile-XXXXXX";
914         const char *f, *t;
915         int r;
916
917         assert(directory);
918
919         if (arg_volatile != VOLATILE_YES)
920                 return 0;
921
922         /* --volatile=yes means we mount a tmpfs to the root dir, and
923            the original /usr to use inside it, and that read-only. */
924
925         if (!mkdtemp(template)) {
926                 log_error("Failed to create temporary directory: %m");
927                 return -errno;
928         }
929
930         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
931                 log_error("Failed to mount tmpfs for root directory: %m");
932                 r = -errno;
933                 goto fail;
934         }
935
936         tmpfs_mounted = true;
937
938         f = strappenda(directory, "/usr");
939         t = strappenda(template, "/usr");
940
941         mkdir(t, 0755);
942         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
943                 log_error("Failed to create /usr bind mount: %m");
944                 r = -errno;
945                 goto fail;
946         }
947
948         bind_mounted = true;
949
950         r = bind_remount_recursive(t, true);
951         if (r < 0) {
952                 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
953                 goto fail;
954         }
955
956         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
957                 log_error("Failed to move root mount: %m");
958                 r = -errno;
959                 goto fail;
960         }
961
962         rmdir(template);
963
964         return 0;
965
966 fail:
967         if (bind_mounted)
968                 umount(t);
969         if (tmpfs_mounted)
970                 umount(template);
971         rmdir(template);
972         return r;
973 }
974
975 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
976
977         snprintf(s, 37,
978                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
979                  SD_ID128_FORMAT_VAL(id));
980
981         return s;
982 }
983
984 static int setup_boot_id(const char *dest) {
985         _cleanup_free_ char *from = NULL, *to = NULL;
986         sd_id128_t rnd = {};
987         char as_uuid[37];
988         int r;
989
990         assert(dest);
991
992         if (arg_share_system)
993                 return 0;
994
995         /* Generate a new randomized boot ID, so that each boot-up of
996          * the container gets a new one */
997
998         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
999         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1000         if (!from || !to)
1001                 return log_oom();
1002
1003         r = sd_id128_randomize(&rnd);
1004         if (r < 0) {
1005                 log_error("Failed to generate random boot id: %s", strerror(-r));
1006                 return r;
1007         }
1008
1009         id128_format_as_uuid(rnd, as_uuid);
1010
1011         r = write_string_file(from, as_uuid);
1012         if (r < 0) {
1013                 log_error("Failed to write boot id: %s", strerror(-r));
1014                 return r;
1015         }
1016
1017         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1018                 log_error("Failed to bind mount boot id: %m");
1019                 r = -errno;
1020         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1021                 log_warning("Failed to make boot id read-only: %m");
1022
1023         unlink(from);
1024         return r;
1025 }
1026
1027 static int copy_devnodes(const char *dest) {
1028
1029         static const char devnodes[] =
1030                 "null\0"
1031                 "zero\0"
1032                 "full\0"
1033                 "random\0"
1034                 "urandom\0"
1035                 "tty\0";
1036
1037         const char *d;
1038         int r = 0;
1039         _cleanup_umask_ mode_t u;
1040
1041         assert(dest);
1042
1043         u = umask(0000);
1044
1045         NULSTR_FOREACH(d, devnodes) {
1046                 _cleanup_free_ char *from = NULL, *to = NULL;
1047                 struct stat st;
1048
1049                 from = strappend("/dev/", d);
1050                 to = strjoin(dest, "/dev/", d, NULL);
1051                 if (!from || !to)
1052                         return log_oom();
1053
1054                 if (stat(from, &st) < 0) {
1055
1056                         if (errno != ENOENT) {
1057                                 log_error("Failed to stat %s: %m", from);
1058                                 return -errno;
1059                         }
1060
1061                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1062
1063                         log_error("%s is not a char or block device, cannot copy", from);
1064                         return -EIO;
1065
1066                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1067
1068                         log_error("mknod(%s) failed: %m", dest);
1069                         return  -errno;
1070                 }
1071         }
1072
1073         return r;
1074 }
1075
1076 static int setup_ptmx(const char *dest) {
1077         _cleanup_free_ char *p = NULL;
1078
1079         p = strappend(dest, "/dev/ptmx");
1080         if (!p)
1081                 return log_oom();
1082
1083         if (symlink("pts/ptmx", p) < 0) {
1084                 log_error("Failed to create /dev/ptmx symlink: %m");
1085                 return -errno;
1086         }
1087
1088         return 0;
1089 }
1090
1091 static int setup_dev_console(const char *dest, const char *console) {
1092         _cleanup_umask_ mode_t u;
1093         const char *to;
1094         struct stat st;
1095         int r;
1096
1097         assert(dest);
1098         assert(console);
1099
1100         u = umask(0000);
1101
1102         if (stat("/dev/null", &st) < 0) {
1103                 log_error("Failed to stat /dev/null: %m");
1104                 return -errno;
1105         }
1106
1107         r = chmod_and_chown(console, 0600, 0, 0);
1108         if (r < 0) {
1109                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
1110                 return r;
1111         }
1112
1113         /* We need to bind mount the right tty to /dev/console since
1114          * ptys can only exist on pts file systems. To have something
1115          * to bind mount things on we create a device node first, and
1116          * use /dev/null for that since we the cgroups device policy
1117          * allows us to create that freely, while we cannot create
1118          * /dev/console. (Note that the major minor doesn't actually
1119          * matter here, since we mount it over anyway). */
1120
1121         to = strappenda(dest, "/dev/console");
1122         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1123                 log_error("mknod() for /dev/console failed: %m");
1124                 return -errno;
1125         }
1126
1127         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1128                 log_error("Bind mount for /dev/console failed: %m");
1129                 return -errno;
1130         }
1131
1132         return 0;
1133 }
1134
1135 static int setup_kmsg(const char *dest, int kmsg_socket) {
1136         _cleanup_free_ char *from = NULL, *to = NULL;
1137         int r, fd, k;
1138         _cleanup_umask_ mode_t u;
1139         union {
1140                 struct cmsghdr cmsghdr;
1141                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1142         } control = {};
1143         struct msghdr mh = {
1144                 .msg_control = &control,
1145                 .msg_controllen = sizeof(control),
1146         };
1147         struct cmsghdr *cmsg;
1148
1149         assert(dest);
1150         assert(kmsg_socket >= 0);
1151
1152         u = umask(0000);
1153
1154         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1155          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1156          * on the reading side behave very similar to /proc/kmsg,
1157          * their writing side behaves differently from /dev/kmsg in
1158          * that writing blocks when nothing is reading. In order to
1159          * avoid any problems with containers deadlocking due to this
1160          * we simply make /dev/kmsg unavailable to the container. */
1161         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1162             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1163                 return log_oom();
1164
1165         if (mkfifo(from, 0600) < 0) {
1166                 log_error("mkfifo() for /dev/kmsg failed: %m");
1167                 return -errno;
1168         }
1169
1170         r = chmod_and_chown(from, 0600, 0, 0);
1171         if (r < 0) {
1172                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1173                 return r;
1174         }
1175
1176         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1177                 log_error("Bind mount for /proc/kmsg failed: %m");
1178                 return -errno;
1179         }
1180
1181         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1182         if (fd < 0) {
1183                 log_error("Failed to open fifo: %m");
1184                 return -errno;
1185         }
1186
1187         cmsg = CMSG_FIRSTHDR(&mh);
1188         cmsg->cmsg_level = SOL_SOCKET;
1189         cmsg->cmsg_type = SCM_RIGHTS;
1190         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1191         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1192
1193         mh.msg_controllen = cmsg->cmsg_len;
1194
1195         /* Store away the fd in the socket, so that it stays open as
1196          * long as we run the child */
1197         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1198         safe_close(fd);
1199
1200         if (k < 0) {
1201                 log_error("Failed to send FIFO fd: %m");
1202                 return -errno;
1203         }
1204
1205         /* And now make the FIFO unavailable as /dev/kmsg... */
1206         unlink(from);
1207         return 0;
1208 }
1209
1210 static int setup_hostname(void) {
1211
1212         if (arg_share_system)
1213                 return 0;
1214
1215         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1216                 return -errno;
1217
1218         return 0;
1219 }
1220
1221 static int setup_journal(const char *directory) {
1222         sd_id128_t machine_id, this_id;
1223         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1224         char *id;
1225         int r;
1226
1227         p = strappend(directory, "/etc/machine-id");
1228         if (!p)
1229                 return log_oom();
1230
1231         r = read_one_line_file(p, &b);
1232         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1233                 return 0;
1234         else if (r < 0) {
1235                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1236                 return r;
1237         }
1238
1239         id = strstrip(b);
1240         if (isempty(id) && arg_link_journal == LINK_AUTO)
1241                 return 0;
1242
1243         /* Verify validity */
1244         r = sd_id128_from_string(id, &machine_id);
1245         if (r < 0) {
1246                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1247                 return r;
1248         }
1249
1250         r = sd_id128_get_machine(&this_id);
1251         if (r < 0) {
1252                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1253                 return r;
1254         }
1255
1256         if (sd_id128_equal(machine_id, this_id)) {
1257                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1258                          "Host and machine ids are equal (%s): refusing to link journals", id);
1259                 if (arg_link_journal == LINK_AUTO)
1260                         return 0;
1261                 return
1262                         -EEXIST;
1263         }
1264
1265         if (arg_link_journal == LINK_NO)
1266                 return 0;
1267
1268         free(p);
1269         p = strappend("/var/log/journal/", id);
1270         q = strjoin(directory, "/var/log/journal/", id, NULL);
1271         if (!p || !q)
1272                 return log_oom();
1273
1274         if (path_is_mount_point(p, false) > 0) {
1275                 if (arg_link_journal != LINK_AUTO) {
1276                         log_error("%s: already a mount point, refusing to use for journal", p);
1277                         return -EEXIST;
1278                 }
1279
1280                 return 0;
1281         }
1282
1283         if (path_is_mount_point(q, false) > 0) {
1284                 if (arg_link_journal != LINK_AUTO) {
1285                         log_error("%s: already a mount point, refusing to use for journal", q);
1286                         return -EEXIST;
1287                 }
1288
1289                 return 0;
1290         }
1291
1292         r = readlink_and_make_absolute(p, &d);
1293         if (r >= 0) {
1294                 if ((arg_link_journal == LINK_GUEST ||
1295                      arg_link_journal == LINK_AUTO) &&
1296                     path_equal(d, q)) {
1297
1298                         r = mkdir_p(q, 0755);
1299                         if (r < 0)
1300                                 log_warning("failed to create directory %s: %m", q);
1301                         return 0;
1302                 }
1303
1304                 if (unlink(p) < 0) {
1305                         log_error("Failed to remove symlink %s: %m", p);
1306                         return -errno;
1307                 }
1308         } else if (r == -EINVAL) {
1309
1310                 if (arg_link_journal == LINK_GUEST &&
1311                     rmdir(p) < 0) {
1312
1313                         if (errno == ENOTDIR) {
1314                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1315                                 return r;
1316                         } else {
1317                                 log_error("Failed to remove %s: %m", p);
1318                                 return -errno;
1319                         }
1320                 }
1321         } else if (r != -ENOENT) {
1322                 log_error("readlink(%s) failed: %m", p);
1323                 return r;
1324         }
1325
1326         if (arg_link_journal == LINK_GUEST) {
1327
1328                 if (symlink(q, p) < 0) {
1329                         log_error("Failed to symlink %s to %s: %m", q, p);
1330                         return -errno;
1331                 }
1332
1333                 r = mkdir_p(q, 0755);
1334                 if (r < 0)
1335                         log_warning("failed to create directory %s: %m", q);
1336                 return 0;
1337         }
1338
1339         if (arg_link_journal == LINK_HOST) {
1340                 r = mkdir_p(p, 0755);
1341                 if (r < 0) {
1342                         log_error("Failed to create %s: %m", p);
1343                         return r;
1344                 }
1345
1346         } else if (access(p, F_OK) < 0)
1347                 return 0;
1348
1349         if (dir_is_empty(q) == 0)
1350                 log_warning("%s is not empty, proceeding anyway.", q);
1351
1352         r = mkdir_p(q, 0755);
1353         if (r < 0) {
1354                 log_error("Failed to create %s: %m", q);
1355                 return r;
1356         }
1357
1358         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1359                 log_error("Failed to bind mount journal from host into guest: %m");
1360                 return -errno;
1361         }
1362
1363         return 0;
1364 }
1365
1366 static int setup_kdbus(const char *dest, const char *path) {
1367         const char *p;
1368
1369         if (!path)
1370                 return 0;
1371
1372         p = strappenda(dest, "/dev/kdbus");
1373         if (mkdir(p, 0755) < 0) {
1374                 log_error("Failed to create kdbus path: %m");
1375                 return  -errno;
1376         }
1377
1378         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1379                 log_error("Failed to mount kdbus domain path: %m");
1380                 return -errno;
1381         }
1382
1383         return 0;
1384 }
1385
1386 static int drop_capabilities(void) {
1387         return capability_bounding_set_drop(~arg_retain, false);
1388 }
1389
1390 static int register_machine(pid_t pid, int local_ifindex) {
1391         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1392         _cleanup_bus_unref_ sd_bus *bus = NULL;
1393         int r;
1394
1395         if (!arg_register)
1396                 return 0;
1397
1398         r = sd_bus_default_system(&bus);
1399         if (r < 0) {
1400                 log_error("Failed to open system bus: %s", strerror(-r));
1401                 return r;
1402         }
1403
1404         if (arg_keep_unit) {
1405                 r = sd_bus_call_method(
1406                                 bus,
1407                                 "org.freedesktop.machine1",
1408                                 "/org/freedesktop/machine1",
1409                                 "org.freedesktop.machine1.Manager",
1410                                 "RegisterMachineWithNetwork",
1411                                 &error,
1412                                 NULL,
1413                                 "sayssusai",
1414                                 arg_machine,
1415                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1416                                 "nspawn",
1417                                 "container",
1418                                 (uint32_t) pid,
1419                                 strempty(arg_directory),
1420                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1421         } else {
1422                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1423
1424                 r = sd_bus_message_new_method_call(
1425                                 bus,
1426                                 &m,
1427                                 "org.freedesktop.machine1",
1428                                 "/org/freedesktop/machine1",
1429                                 "org.freedesktop.machine1.Manager",
1430                                 "CreateMachineWithNetwork");
1431                 if (r < 0) {
1432                         log_error("Failed to create message: %s", strerror(-r));
1433                         return r;
1434                 }
1435
1436                 r = sd_bus_message_append(
1437                                 m,
1438                                 "sayssusai",
1439                                 arg_machine,
1440                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1441                                 "nspawn",
1442                                 "container",
1443                                 (uint32_t) pid,
1444                                 strempty(arg_directory),
1445                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1446                 if (r < 0) {
1447                         log_error("Failed to append message arguments: %s", strerror(-r));
1448                         return r;
1449                 }
1450
1451                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1452                 if (r < 0) {
1453                         log_error("Failed to open container: %s", strerror(-r));
1454                         return r;
1455                 }
1456
1457                 if (!isempty(arg_slice)) {
1458                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1459                         if (r < 0) {
1460                                 log_error("Failed to append slice: %s", strerror(-r));
1461                                 return r;
1462                         }
1463                 }
1464
1465                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1466                 if (r < 0) {
1467                         log_error("Failed to add device policy: %s", strerror(-r));
1468                         return r;
1469                 }
1470
1471                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1472                                           /* Allow the container to
1473                                            * access and create the API
1474                                            * device nodes, so that
1475                                            * PrivateDevices= in the
1476                                            * container can work
1477                                            * fine */
1478                                           "/dev/null", "rwm",
1479                                           "/dev/zero", "rwm",
1480                                           "/dev/full", "rwm",
1481                                           "/dev/random", "rwm",
1482                                           "/dev/urandom", "rwm",
1483                                           "/dev/tty", "rwm",
1484                                           /* Allow the container
1485                                            * access to ptys. However,
1486                                            * do not permit the
1487                                            * container to ever create
1488                                            * these device nodes. */
1489                                           "/dev/pts/ptmx", "rw",
1490                                           "char-pts", "rw",
1491                                           /* Allow the container
1492                                            * access to all kdbus
1493                                            * devices. Again, the
1494                                            * container cannot create
1495                                            * these nodes, only use
1496                                            * them. We use a pretty
1497                                            * open match here, so that
1498                                            * the kernel API can still
1499                                            * change. */
1500                                           "char-kdbus", "rw",
1501                                           "char-kdbus/*", "rw");
1502                 if (r < 0) {
1503                         log_error("Failed to add device whitelist: %s", strerror(-r));
1504                         return r;
1505                 }
1506
1507                 r = sd_bus_message_close_container(m);
1508                 if (r < 0) {
1509                         log_error("Failed to close container: %s", strerror(-r));
1510                         return r;
1511                 }
1512
1513                 r = sd_bus_call(bus, m, 0, &error, NULL);
1514         }
1515
1516         if (r < 0) {
1517                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1518                 return r;
1519         }
1520
1521         return 0;
1522 }
1523
1524 static int terminate_machine(pid_t pid) {
1525         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1526         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1527         _cleanup_bus_unref_ sd_bus *bus = NULL;
1528         const char *path;
1529         int r;
1530
1531         if (!arg_register)
1532                 return 0;
1533
1534         r = sd_bus_default_system(&bus);
1535         if (r < 0) {
1536                 log_error("Failed to open system bus: %s", strerror(-r));
1537                 return r;
1538         }
1539
1540         r = sd_bus_call_method(
1541                         bus,
1542                         "org.freedesktop.machine1",
1543                         "/org/freedesktop/machine1",
1544                         "org.freedesktop.machine1.Manager",
1545                         "GetMachineByPID",
1546                         &error,
1547                         &reply,
1548                         "u",
1549                         (uint32_t) pid);
1550         if (r < 0) {
1551                 /* Note that the machine might already have been
1552                  * cleaned up automatically, hence don't consider it a
1553                  * failure if we cannot get the machine object. */
1554                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1555                 return 0;
1556         }
1557
1558         r = sd_bus_message_read(reply, "o", &path);
1559         if (r < 0)
1560                 return bus_log_parse_error(r);
1561
1562         r = sd_bus_call_method(
1563                         bus,
1564                         "org.freedesktop.machine1",
1565                         path,
1566                         "org.freedesktop.machine1.Machine",
1567                         "Terminate",
1568                         &error,
1569                         NULL,
1570                         NULL);
1571         if (r < 0) {
1572                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1573                 return 0;
1574         }
1575
1576         return 0;
1577 }
1578
1579 static int reset_audit_loginuid(void) {
1580         _cleanup_free_ char *p = NULL;
1581         int r;
1582
1583         if (arg_share_system)
1584                 return 0;
1585
1586         r = read_one_line_file("/proc/self/loginuid", &p);
1587         if (r == -ENOENT)
1588                 return 0;
1589         if (r < 0) {
1590                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1591                 return r;
1592         }
1593
1594         /* Already reset? */
1595         if (streq(p, "4294967295"))
1596                 return 0;
1597
1598         r = write_string_file("/proc/self/loginuid", "4294967295");
1599         if (r < 0) {
1600                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1601                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1602                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1603                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1604                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1605
1606                 sleep(5);
1607         }
1608
1609         return 0;
1610 }
1611
1612 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1613
1614 static int get_mac(struct ether_addr *mac) {
1615         int r;
1616
1617         uint8_t result[8];
1618         size_t l, sz;
1619         uint8_t *v;
1620
1621         l = strlen(arg_machine);
1622         sz = sizeof(sd_id128_t) + l;
1623         v = alloca(sz);
1624
1625         /* fetch some persistent data unique to the host */
1626         r = sd_id128_get_machine((sd_id128_t*) v);
1627         if (r < 0)
1628                 return r;
1629
1630         /* combine with some data unique (on this host) to this
1631          * container instance */
1632         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1633
1634         /* Let's hash the host machine ID plus the container name. We
1635          * use a fixed, but originally randomly created hash key here. */
1636         siphash24(result, v, sz, HASH_KEY.bytes);
1637
1638         assert_cc(ETH_ALEN <= sizeof(result));
1639         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1640
1641         /* see eth_random_addr in the kernel */
1642         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1643         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1644
1645         return 0;
1646 }
1647
1648 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1649         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1650         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1651         struct ether_addr mac;
1652         int r, i;
1653
1654         if (!arg_private_network)
1655                 return 0;
1656
1657         if (!arg_network_veth)
1658                 return 0;
1659
1660         /* Use two different interface name prefixes depending whether
1661          * we are in bridge mode or not. */
1662         if (arg_network_bridge)
1663                 memcpy(iface_name, "vb-", 3);
1664         else
1665                 memcpy(iface_name, "ve-", 3);
1666         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1667
1668         r = get_mac(&mac);
1669         if (r < 0) {
1670                 log_error("Failed to generate predictable MAC address for host0");
1671                 return r;
1672         }
1673
1674         r = sd_rtnl_open(&rtnl, 0);
1675         if (r < 0) {
1676                 log_error("Failed to connect to netlink: %s", strerror(-r));
1677                 return r;
1678         }
1679
1680         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1681         if (r < 0) {
1682                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1683                 return r;
1684         }
1685
1686         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1687         if (r < 0) {
1688                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1689                 return r;
1690         }
1691
1692         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1693         if (r < 0) {
1694                 log_error("Failed to open netlink container: %s", strerror(-r));
1695                 return r;
1696         }
1697
1698         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1699         if (r < 0) {
1700                 log_error("Failed to open netlink container: %s", strerror(-r));
1701                 return r;
1702         }
1703
1704         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1705         if (r < 0) {
1706                 log_error("Failed to open netlink container: %s", strerror(-r));
1707                 return r;
1708         }
1709
1710         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1711         if (r < 0) {
1712                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1713                 return r;
1714         }
1715
1716         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1717         if (r < 0) {
1718                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1719                 return r;
1720         }
1721
1722         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1723         if (r < 0) {
1724                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1725                 return r;
1726         }
1727
1728         r = sd_rtnl_message_close_container(m);
1729         if (r < 0) {
1730                 log_error("Failed to close netlink container: %s", strerror(-r));
1731                 return r;
1732         }
1733
1734         r = sd_rtnl_message_close_container(m);
1735         if (r < 0) {
1736                 log_error("Failed to close netlink container: %s", strerror(-r));
1737                 return r;
1738         }
1739
1740         r = sd_rtnl_message_close_container(m);
1741         if (r < 0) {
1742                 log_error("Failed to close netlink container: %s", strerror(-r));
1743                 return r;
1744         }
1745
1746         r = sd_rtnl_call(rtnl, m, 0, NULL);
1747         if (r < 0) {
1748                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1749                 return r;
1750         }
1751
1752         i = (int) if_nametoindex(iface_name);
1753         if (i <= 0) {
1754                 log_error("Failed to resolve interface %s: %m", iface_name);
1755                 return -errno;
1756         }
1757
1758         *ifi = i;
1759
1760         return 0;
1761 }
1762
1763 static int setup_bridge(const char veth_name[], int *ifi) {
1764         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1765         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1766         int r, bridge;
1767
1768         if (!arg_private_network)
1769                 return 0;
1770
1771         if (!arg_network_veth)
1772                 return 0;
1773
1774         if (!arg_network_bridge)
1775                 return 0;
1776
1777         bridge = (int) if_nametoindex(arg_network_bridge);
1778         if (bridge <= 0) {
1779                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1780                 return -errno;
1781         }
1782
1783         *ifi = bridge;
1784
1785         r = sd_rtnl_open(&rtnl, 0);
1786         if (r < 0) {
1787                 log_error("Failed to connect to netlink: %s", strerror(-r));
1788                 return r;
1789         }
1790
1791         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1792         if (r < 0) {
1793                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1794                 return r;
1795         }
1796
1797         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1798         if (r < 0) {
1799                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1800                 return r;
1801         }
1802
1803         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1804         if (r < 0) {
1805                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1806                 return r;
1807         }
1808
1809         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1810         if (r < 0) {
1811                 log_error("Failed to add netlink master field: %s", strerror(-r));
1812                 return r;
1813         }
1814
1815         r = sd_rtnl_call(rtnl, m, 0, NULL);
1816         if (r < 0) {
1817                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1818                 return r;
1819         }
1820
1821         return 0;
1822 }
1823
1824 static int parse_interface(struct udev *udev, const char *name) {
1825         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1826         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1827         int ifi;
1828
1829         ifi = (int) if_nametoindex(name);
1830         if (ifi <= 0) {
1831                 log_error("Failed to resolve interface %s: %m", name);
1832                 return -errno;
1833         }
1834
1835         sprintf(ifi_str, "n%i", ifi);
1836         d = udev_device_new_from_device_id(udev, ifi_str);
1837         if (!d) {
1838                 log_error("Failed to get udev device for interface %s: %m", name);
1839                 return -errno;
1840         }
1841
1842         if (udev_device_get_is_initialized(d) <= 0) {
1843                 log_error("Network interface %s is not initialized yet.", name);
1844                 return -EBUSY;
1845         }
1846
1847         return ifi;
1848 }
1849
1850 static int move_network_interfaces(pid_t pid) {
1851         _cleanup_udev_unref_ struct udev *udev = NULL;
1852         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1853         char **i;
1854         int r;
1855
1856         if (!arg_private_network)
1857                 return 0;
1858
1859         if (strv_isempty(arg_network_interfaces))
1860                 return 0;
1861
1862         r = sd_rtnl_open(&rtnl, 0);
1863         if (r < 0) {
1864                 log_error("Failed to connect to netlink: %s", strerror(-r));
1865                 return r;
1866         }
1867
1868         udev = udev_new();
1869         if (!udev) {
1870                 log_error("Failed to connect to udev.");
1871                 return -ENOMEM;
1872         }
1873
1874         STRV_FOREACH(i, arg_network_interfaces) {
1875                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1876                 int ifi;
1877
1878                 ifi = parse_interface(udev, *i);
1879                 if (ifi < 0)
1880                         return ifi;
1881
1882                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1883                 if (r < 0) {
1884                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1885                         return r;
1886                 }
1887
1888                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1889                 if (r < 0) {
1890                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1891                         return r;
1892                 }
1893
1894                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1895                 if (r < 0) {
1896                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1897                         return r;
1898                 }
1899         }
1900
1901         return 0;
1902 }
1903
1904 static int setup_macvlan(pid_t pid) {
1905         _cleanup_udev_unref_ struct udev *udev = NULL;
1906         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1907         char **i;
1908         int r;
1909
1910         if (!arg_private_network)
1911                 return 0;
1912
1913         if (strv_isempty(arg_network_macvlan))
1914                 return 0;
1915
1916         r = sd_rtnl_open(&rtnl, 0);
1917         if (r < 0) {
1918                 log_error("Failed to connect to netlink: %s", strerror(-r));
1919                 return r;
1920         }
1921
1922         udev = udev_new();
1923         if (!udev) {
1924                 log_error("Failed to connect to udev.");
1925                 return -ENOMEM;
1926         }
1927
1928         STRV_FOREACH(i, arg_network_macvlan) {
1929                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1930                 _cleanup_free_ char *n = NULL;
1931                 int ifi;
1932
1933                 ifi = parse_interface(udev, *i);
1934                 if (ifi < 0)
1935                         return ifi;
1936
1937                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1938                 if (r < 0) {
1939                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1940                         return r;
1941                 }
1942
1943                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1944                 if (r < 0) {
1945                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1946                         return r;
1947                 }
1948
1949                 n = strappend("mv-", *i);
1950                 if (!n)
1951                         return log_oom();
1952
1953                 strshorten(n, IFNAMSIZ-1);
1954
1955                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1956                 if (r < 0) {
1957                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1958                         return r;
1959                 }
1960
1961                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1962                 if (r < 0) {
1963                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1964                         return r;
1965                 }
1966
1967                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1968                 if (r < 0) {
1969                         log_error("Failed to open netlink container: %s", strerror(-r));
1970                         return r;
1971                 }
1972
1973                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1974                 if (r < 0) {
1975                         log_error("Failed to open netlink container: %s", strerror(-r));
1976                         return r;
1977                 }
1978
1979                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1980                 if (r < 0) {
1981                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1982                         return r;
1983                 }
1984
1985                 r = sd_rtnl_message_close_container(m);
1986                 if (r < 0) {
1987                         log_error("Failed to close netlink container: %s", strerror(-r));
1988                         return r;
1989                 }
1990
1991                 r = sd_rtnl_message_close_container(m);
1992                 if (r < 0) {
1993                         log_error("Failed to close netlink container: %s", strerror(-r));
1994                         return r;
1995                 }
1996
1997                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1998                 if (r < 0) {
1999                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
2000                         return r;
2001                 }
2002         }
2003
2004         return 0;
2005 }
2006
/* Builds and loads a seccomp filter that allows everything by default, makes
 * the syscalls listed below fail with EPERM, and makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns 0 on success or a negative errno-style value on failure. Without
 * HAVE_SECCOMP this is a no-op that returns 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto out;
        }

        for (k = 0; k < ELEMENTSOF(blocked_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[k], 0);

                /* -EFAULT means the syscall is unknown; that is not an
                 * error, simply move on to the next one. */
                if (r < 0 && r != -EFAULT) {
                        log_error("Failed to block syscall: %s", strerror(-r));
                        goto out;
                }
        }

        /* Audit is broken in containers, and much of the userspace audit
         * hookup fails when run inside one. We don't care and simply turn
         * off creation of audit sockets: socket(AF_NETLINK, *,
         * NETLINK_AUDIT) is made to fail with EAFNOSUPPORT, which audit
         * userspace takes as indication that audit is disabled in the
         * kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto out;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto out;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

out:
        /* The filter context is released on every path, success included. */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2086
2087 static int setup_image(char **device_path, int *loop_nr) {
2088         struct loop_info64 info = {
2089                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2090         };
2091         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2092         _cleanup_free_ char* loopdev = NULL;
2093         struct stat st;
2094         int r, nr;
2095
2096         assert(device_path);
2097         assert(loop_nr);
2098
2099         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2100         if (fd < 0) {
2101                 log_error("Failed to open %s: %m", arg_image);
2102                 return -errno;
2103         }
2104
2105         if (fstat(fd, &st) < 0) {
2106                 log_error("Failed to stat %s: %m", arg_image);
2107                 return -errno;
2108         }
2109
2110         if (S_ISBLK(st.st_mode)) {
2111                 char *p;
2112
2113                 p = strdup(arg_image);
2114                 if (!p)
2115                         return log_oom();
2116
2117                 *device_path = p;
2118
2119                 *loop_nr = -1;
2120
2121                 r = fd;
2122                 fd = -1;
2123
2124                 return r;
2125         }
2126
2127         if (!S_ISREG(st.st_mode)) {
2128                 log_error("%s is not a regular file or block device: %m", arg_image);
2129                 return -EINVAL;
2130         }
2131
2132         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2133         if (control < 0) {
2134                 log_error("Failed to open /dev/loop-control: %m");
2135                 return -errno;
2136         }
2137
2138         nr = ioctl(control, LOOP_CTL_GET_FREE);
2139         if (nr < 0) {
2140                 log_error("Failed to allocate loop device: %m");
2141                 return -errno;
2142         }
2143
2144         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2145                 return log_oom();
2146
2147         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2148         if (loop < 0) {
2149                 log_error("Failed to open loop device %s: %m", loopdev);
2150                 return -errno;
2151         }
2152
2153         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2154                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2155                 return -errno;
2156         }
2157
2158         if (arg_read_only)
2159                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2160
2161         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2162                 log_error("Failed to set loopback settings on %s: %m", loopdev);
2163                 return -errno;
2164         }
2165
2166         *device_path = loopdev;
2167         loopdev = NULL;
2168
2169         *loop_nr = nr;
2170
2171         r = loop;
2172         loop = -1;
2173
2174         return r;
2175 }
2176
/* Looks for the root, /home and /srv partitions in the GPT disk image opened
 * as fd.
 *
 * The partition table is probed with libblkid and must be of type "gpt".
 * The partition block devices below the image device are enumerated via
 * udev; each one is matched against the GPT_HOME, GPT_SRV and (where
 * defined) GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY type UUIDs. Partitions
 * flagged GPT_FLAG_NO_AUTO are ignored, and for each type the partition with
 * the lowest partition number wins.
 *
 * fd:            open file descriptor of the image block device.
 * root_device:   receives the device node of the root partition (the native
 *                one if found, otherwise the secondary one).
 * home_device:   receives the device node of the /home partition, if any.
 * srv_device:    receives the device node of the /srv partition, if any.
 * *_device_rw:   receive false if the respective partition carries
 *                GPT_FLAG_READ_ONLY, true otherwise.
 * secondary:     receives true if the secondary root partition was chosen.
 *
 * The returned strings are newly allocated and owned by the caller. Outputs
 * for /home and /srv are left untouched if no such partition exists.
 *
 * Returns 0 on success, -EINVAL if no suitable partition table or root
 * partition is found, -ENOTSUP when built without blkid support, or another
 * negative errno-style value on failure. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerate = NULL;
        _cleanup_udev_device_unref_ struct udev_device *disk = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *head, *entry;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist partitions;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        /* errno is cleared up front so that a failure which leaves it
         * untouched can be told apart (and is reported as OOM). */
        errno = 0;
        r = blkid_probe_set_device(probe, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }
        if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        /* If the lookup fails pttype stays NULL, which streq_ptr() below
         * treats as a mismatch. */
        blkid_probe_lookup_value(probe, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partitions = blkid_probe_get_partitions(probe);
        if (!partitions) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        /* Enumerate all udev devices that have the image's block device as
         * parent. */
        disk = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!disk)
                return log_oom();

        enumerate = udev_enumerate_new(udev);
        if (!enumerate)
                return log_oom();

        r = udev_enumerate_add_match_parent(enumerate, disk);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(enumerate);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        head = udev_enumerate_get_list_entry(enumerate);
        udev_list_entry_foreach(entry, head) {
                _cleanup_udev_device_unref_ struct udev_device *part_dev;
                const char *type_string, *devnode;
                unsigned long long part_flags;
                sd_id128_t type_uuid;
                blkid_partition part;
                dev_t devnum;
                int partno;

                /* part_dev is assigned before any exit from this scope, so
                 * the cleanup handler never sees an uninitialized pointer. */
                errno = 0;
                part_dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(entry));
                if (!part_dev) {
                        if (errno == 0)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number, and the whole-disk
                 * device itself. */
                devnum = udev_device_get_devnum(part_dev);
                if (major(devnum) == 0 || st.st_rdev == devnum)
                        continue;

                devnode = udev_device_get_devnode(part_dev);
                if (!devnode)
                        continue;

                part = blkid_partlist_devno_to_partition(partitions, devnum);
                if (!part)
                        continue;

                part_flags = blkid_partition_get_flags(part);
                if (part_flags & GPT_FLAG_NO_AUTO)
                        continue;

                partno = blkid_partition_get_partno(part);
                if (partno < 0)
                        continue;

                type_string = blkid_partition_get_type_string(part);
                if (!type_string)
                        continue;

                if (sd_id128_from_string(type_string, &type_uuid) < 0)
                        continue;

                /* For each partition type the candidate with the lowest
                 * partition number is kept. */
                if (sd_id128_equal(type_uuid, GPT_HOME)) {

                        if (home && partno >= home_nr)
                                continue;

                        home_nr = partno;
                        home_rw = !(part_flags & GPT_FLAG_READ_ONLY);

                        free(home);
                        home = strdup(devnode);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_uuid, GPT_SRV)) {

                        if (srv && partno >= srv_nr)
                                continue;

                        srv_nr = partno;
                        srv_rw = !(part_flags & GPT_FLAG_READ_ONLY);

                        free(srv);
                        srv = strdup(devnode);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_uuid, GPT_ROOT_NATIVE)) {

                        if (root && partno >= root_nr)
                                continue;

                        root_nr = partno;
                        root_rw = !(part_flags & GPT_FLAG_READ_ONLY);

                        free(root);
                        root = strdup(devnode);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_uuid, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && partno >= secondary_root_nr)
                                continue;

                        secondary_root_nr = partno;
                        secondary_root_rw = !(part_flags & GPT_FLAG_READ_ONLY);

                        free(secondary_root);
                        secondary_root = strdup(devnode);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Pass the results to the caller. The local pointers are reset to
         * NULL so that the cleanup handlers do not free what we return. The
         * native root partition takes precedence over the secondary one. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2426
/* Mounts the block device what below where, after detecting its file system
 * type with libblkid.
 *
 * what:      device node to mount.
 * where:     base directory of the mount point.
 * directory: optional path that is appended to where to form the mount
 *            point; if NULL, where itself is the mount point.
 * rw:        whether to mount writable; overridden to read-only when
 *            arg_read_only is set.
 *
 * The file system is always mounted with MS_NODEV.
 *
 * Returns 0 on success, -EINVAL if no file system type can be determined,
 * -ENOTSUP for LUKS volumes or when built without blkid support, or another
 * negative errno-style value on failure. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* errno is cleared up front so that a failure which leaves it
         * untouched can be told apart (and is reported as OOM). */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        /* blkid_do_safeprobe() returns 1 if nothing was detected and -2 if
         * the result is ambivalent; both mean that no file system type can
         * be determined. -1 is a real error and is handled by the generic
         * branch below, so that the actual errno is reported. */
        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2492
/* Mounts the partitions found in a disk image below "where": the root
 * device on "where" itself, the home device on "where"/home and the
 * server data device on "where"/srv. A NULL device is skipped. Each
 * *_rw flag is handed on to mount_device() to select a read-write or
 * read-only mount.
 *
 * Returns 0 on success, or the negative error code of the first
 * mount_device() call that failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Always mount relative to the caller-supplied "where" rather
         * than reaching for the global arg_directory. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2528
2529 static void loop_remove(int nr, int *image_fd) {
2530         _cleanup_close_ int control = -1;
2531
2532         if (nr < 0)
2533                 return;
2534
2535         if (image_fd && *image_fd >= 0) {
2536                 ioctl(*image_fd, LOOP_CLR_FD);
2537                 *image_fd = safe_close(*image_fd);
2538         }
2539
2540         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2541         if (control < 0)
2542                 return;
2543
2544         ioctl(control, LOOP_CTL_REMOVE, nr);
2545 }
2546
2547 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2548         int pipe_fds[2];
2549         pid_t pid;
2550
2551         assert(database);
2552         assert(key);
2553         assert(rpid);
2554
2555         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2556                 log_error("Failed to allocate pipe: %m");
2557                 return -errno;
2558         }
2559
2560         pid = fork();
2561         if (pid < 0) {
2562                 log_error("Failed to fork getent child: %m");
2563                 return -errno;
2564         } else if (pid == 0) {
2565                 int nullfd;
2566                 char *empty_env = NULL;
2567
2568                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2569                         _exit(EXIT_FAILURE);
2570
2571                 if (pipe_fds[0] > 2)
2572                         safe_close(pipe_fds[0]);
2573                 if (pipe_fds[1] > 2)
2574                         safe_close(pipe_fds[1]);
2575
2576                 nullfd = open("/dev/null", O_RDWR);
2577                 if (nullfd < 0)
2578                         _exit(EXIT_FAILURE);
2579
2580                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2581                         _exit(EXIT_FAILURE);
2582
2583                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2584                         _exit(EXIT_FAILURE);
2585
2586                 if (nullfd > 2)
2587                         safe_close(nullfd);
2588
2589                 reset_all_signal_handlers();
2590                 close_all_fds(NULL, 0);
2591
2592                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2593                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2594                 _exit(EXIT_FAILURE);
2595         }
2596
2597         pipe_fds[1] = safe_close(pipe_fds[1]);
2598
2599         *rpid = pid;
2600
2601         return pipe_fds[0];
2602 }
2603
2604 static int change_uid_gid(char **_home) {
2605         char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2606         _cleanup_free_ uid_t *uids = NULL;
2607         _cleanup_free_ char *home = NULL;
2608         _cleanup_fclose_ FILE *f = NULL;
2609         _cleanup_close_ int fd = -1;
2610         unsigned n_uids = 0;
2611         size_t sz = 0, l;
2612         uid_t uid;
2613         gid_t gid;
2614         pid_t pid;
2615         int r;
2616
2617         assert(_home);
2618
2619         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2620                 /* Reset everything fully to 0, just in case */
2621
2622                 if (setgroups(0, NULL) < 0) {
2623                         log_error("setgroups() failed: %m");
2624                         return -errno;
2625                 }
2626
2627                 if (setresgid(0, 0, 0) < 0) {
2628                         log_error("setregid() failed: %m");
2629                         return -errno;
2630                 }
2631
2632                 if (setresuid(0, 0, 0) < 0) {
2633                         log_error("setreuid() failed: %m");
2634                         return -errno;
2635                 }
2636
2637                 *_home = NULL;
2638                 return 0;
2639         }
2640
2641         /* First, get user credentials */
2642         fd = spawn_getent("passwd", arg_user, &pid);
2643         if (fd < 0)
2644                 return fd;
2645
2646         f = fdopen(fd, "r");
2647         if (!f)
2648                 return log_oom();
2649         fd = -1;
2650
2651         if (!fgets(line, sizeof(line), f)) {
2652
2653                 if (!ferror(f)) {
2654                         log_error("Failed to resolve user %s.", arg_user);
2655                         return -ESRCH;
2656                 }
2657
2658                 log_error("Failed to read from getent: %m");
2659                 return -errno;
2660         }
2661
2662         truncate_nl(line);
2663
2664         wait_for_terminate_and_warn("getent passwd", pid);
2665
2666         x = strchr(line, ':');
2667         if (!x) {
2668                 log_error("/etc/passwd entry has invalid user field.");
2669                 return -EIO;
2670         }
2671
2672         u = strchr(x+1, ':');
2673         if (!u) {
2674                 log_error("/etc/passwd entry has invalid password field.");
2675                 return -EIO;
2676         }
2677
2678         u++;
2679         g = strchr(u, ':');
2680         if (!g) {
2681                 log_error("/etc/passwd entry has invalid UID field.");
2682                 return -EIO;
2683         }
2684
2685         *g = 0;
2686         g++;
2687         x = strchr(g, ':');
2688         if (!x) {
2689                 log_error("/etc/passwd entry has invalid GID field.");
2690                 return -EIO;
2691         }
2692
2693         *x = 0;
2694         h = strchr(x+1, ':');
2695         if (!h) {
2696                 log_error("/etc/passwd entry has invalid GECOS field.");
2697                 return -EIO;
2698         }
2699
2700         h++;
2701         x = strchr(h, ':');
2702         if (!x) {
2703                 log_error("/etc/passwd entry has invalid home directory field.");
2704                 return -EIO;
2705         }
2706
2707         *x = 0;
2708
2709         r = parse_uid(u, &uid);
2710         if (r < 0) {
2711                 log_error("Failed to parse UID of user.");
2712                 return -EIO;
2713         }
2714
2715         r = parse_gid(g, &gid);
2716         if (r < 0) {
2717                 log_error("Failed to parse GID of user.");
2718                 return -EIO;
2719         }
2720
2721         home = strdup(h);
2722         if (!home)
2723                 return log_oom();
2724
2725         /* Second, get group memberships */
2726         fd = spawn_getent("initgroups", arg_user, &pid);
2727         if (fd < 0)
2728                 return fd;
2729
2730         fclose(f);
2731         f = fdopen(fd, "r");
2732         if (!f)
2733                 return log_oom();
2734         fd = -1;
2735
2736         if (!fgets(line, sizeof(line), f)) {
2737                 if (!ferror(f)) {
2738                         log_error("Failed to resolve user %s.", arg_user);
2739                         return -ESRCH;
2740                 }
2741
2742                 log_error("Failed to read from getent: %m");
2743                 return -errno;
2744         }
2745
2746         truncate_nl(line);
2747
2748         wait_for_terminate_and_warn("getent initgroups", pid);
2749
2750         /* Skip over the username and subsequent separator whitespace */
2751         x = line;
2752         x += strcspn(x, WHITESPACE);
2753         x += strspn(x, WHITESPACE);
2754
2755         FOREACH_WORD(w, l, x, state) {
2756                 char c[l+1];
2757
2758                 memcpy(c, w, l);
2759                 c[l] = 0;
2760
2761                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2762                         return log_oom();
2763
2764                 r = parse_uid(c, &uids[n_uids++]);
2765                 if (r < 0) {
2766                         log_error("Failed to parse group data from getent.");
2767                         return -EIO;
2768                 }
2769         }
2770
2771         r = mkdir_parents(home, 0775);
2772         if (r < 0) {
2773                 log_error("Failed to make home root directory: %s", strerror(-r));
2774                 return r;
2775         }
2776
2777         r = mkdir_safe(home, 0755, uid, gid);
2778         if (r < 0 && r != -EEXIST) {
2779                 log_error("Failed to make home directory: %s", strerror(-r));
2780                 return r;
2781         }
2782
2783         fchown(STDIN_FILENO, uid, gid);
2784         fchown(STDOUT_FILENO, uid, gid);
2785         fchown(STDERR_FILENO, uid, gid);
2786
2787         if (setgroups(n_uids, uids) < 0) {
2788                 log_error("Failed to set auxiliary groups: %m");
2789                 return -errno;
2790         }
2791
2792         if (setresgid(gid, gid, gid) < 0) {
2793                 log_error("setregid() failed: %m");
2794                 return -errno;
2795         }
2796
2797         if (setresuid(uid, uid, uid) < 0) {
2798                 log_error("setreuid() failed: %m");
2799                 return -errno;
2800         }
2801
2802         if (_home) {
2803                 *_home = home;
2804                 home = NULL;
2805         }
2806
2807         return 0;
2808 }
2809
2810 /*
2811  * Return values:
2812  * < 0 : wait_for_terminate() failed to get the state of the
2813  *       container, the container was terminated by a signal, or
2814  *       failed for an unknown reason.  No change is made to the
2815  *       container argument.
2816  * > 0 : The program executed in the container terminated with an
2817  *       error.  The exit code of the program executed in the
2818  *       container is returned.  No change is made to the container
2819  *       argument.
2820  *   0 : The container is being rebooted, has been shut down or exited
2821  *       successfully.  The container argument has been set to either
2822  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2823  *
2824  * That is, success is indicated by a return value of zero, and an
2825  * error is indicated by a non-zero value.
2826  */
2827 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2828         int r;
2829         siginfo_t status;
2830
2831         r = wait_for_terminate(pid, &status);
2832         if (r < 0) {
2833                 log_warning("Failed to wait for container: %s", strerror(-r));
2834                 return r;
2835         }
2836
2837         switch (status.si_code) {
2838         case CLD_EXITED:
2839                 r = status.si_status;
2840                 if (r == 0) {
2841                         if (!arg_quiet)
2842                                 log_debug("Container %s exited successfully.",
2843                                           arg_machine);
2844
2845                         *container = CONTAINER_TERMINATED;
2846                 } else {
2847                         log_error("Container %s failed with error code %i.",
2848                                   arg_machine, status.si_status);
2849                 }
2850                 break;
2851
2852         case CLD_KILLED:
2853                 if (status.si_status == SIGINT) {
2854                         if (!arg_quiet)
2855                                 log_info("Container %s has been shut down.",
2856                                          arg_machine);
2857
2858                         *container = CONTAINER_TERMINATED;
2859                         r = 0;
2860                         break;
2861                 } else if (status.si_status == SIGHUP) {
2862                         if (!arg_quiet)
2863                                 log_info("Container %s is being rebooted.",
2864                                          arg_machine);
2865
2866                         *container = CONTAINER_REBOOTED;
2867                         r = 0;
2868                         break;
2869                 }
2870                 /* CLD_KILLED fallthrough */
2871
2872         case CLD_DUMPED:
2873                 log_error("Container %s terminated by signal %s.",
2874                           arg_machine, signal_to_string(status.si_status));
2875                 r = -1;
2876                 break;
2877
2878         default:
2879                 log_error("Container %s failed due to unknown reason.",
2880                           arg_machine);
2881                 r = -1;
2882                 break;
2883         }
2884
2885         return r;
2886 }
2887
/* Signal handler that intentionally does nothing; main() installs it
 * for SIGCHLD. */
static void nop_handler(int sig) {
}
2889
2890 int main(int argc, char *argv[]) {
2891
2892         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2893         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2894         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2895         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2896         _cleanup_fdset_free_ FDSet *fds = NULL;
2897         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2898         const char *console = NULL;
2899         char veth_name[IFNAMSIZ];
2900         bool secondary = false;
2901         sigset_t mask, mask_chld;
2902         pid_t pid = 0;
2903
2904         log_parse_environment();
2905         log_open();
2906
2907         k = parse_argv(argc, argv);
2908         if (k < 0)
2909                 goto finish;
2910         else if (k == 0) {
2911                 r = EXIT_SUCCESS;
2912                 goto finish;
2913         }
2914
2915         if (!arg_image) {
2916                 if (arg_directory) {
2917                         char *p;
2918
2919                         p = path_make_absolute_cwd(arg_directory);
2920                         free(arg_directory);
2921                         arg_directory = p;
2922                 } else
2923                         arg_directory = get_current_dir_name();
2924
2925                 if (!arg_directory) {
2926                         log_error("Failed to determine path, please use -D.");
2927                         goto finish;
2928                 }
2929                 path_kill_slashes(arg_directory);
2930         }
2931
2932         if (!arg_machine) {
2933                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2934                 if (!arg_machine) {
2935                         log_oom();
2936                         goto finish;
2937                 }
2938
2939                 hostname_cleanup(arg_machine, false);
2940                 if (isempty(arg_machine)) {
2941                         log_error("Failed to determine machine name automatically, please use -M.");
2942                         goto finish;
2943                 }
2944         }
2945
2946         if (geteuid() != 0) {
2947                 log_error("Need to be root.");
2948                 goto finish;
2949         }
2950
2951         if (sd_booted() <= 0) {
2952                 log_error("Not running on a systemd system.");
2953                 goto finish;
2954         }
2955
2956         log_close();
2957         n_fd_passed = sd_listen_fds(false);
2958         if (n_fd_passed > 0) {
2959                 k = fdset_new_listen_fds(&fds, false);
2960                 if (k < 0) {
2961                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2962                         goto finish;
2963                 }
2964         }
2965         fdset_close_others(fds);
2966         log_open();
2967
2968         if (arg_directory) {
2969                 if (path_equal(arg_directory, "/")) {
2970                         log_error("Spawning container on root directory not supported.");
2971                         goto finish;
2972                 }
2973
2974                 if (arg_boot) {
2975                         if (path_is_os_tree(arg_directory) <= 0) {
2976                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2977                                 goto finish;
2978                         }
2979                 } else {
2980                         const char *p;
2981
2982                         p = strappenda(arg_directory,
2983                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2984                         if (access(p, F_OK) < 0) {
2985                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2986                                 goto finish;
2987
2988                         }
2989                 }
2990         } else {
2991                 char template[] = "/tmp/nspawn-root-XXXXXX";
2992
2993                 if (!mkdtemp(template)) {
2994                         log_error("Failed to create temporary directory: %m");
2995                         r = -errno;
2996                         goto finish;
2997                 }
2998
2999                 arg_directory = strdup(template);
3000                 if (!arg_directory) {
3001                         r = log_oom();
3002                         goto finish;
3003                 }
3004
3005                 image_fd = setup_image(&device_path, &loop_nr);
3006                 if (image_fd < 0) {
3007                         r = image_fd;
3008                         goto finish;
3009                 }
3010
3011                 r = dissect_image(image_fd,
3012                                   &root_device, &root_device_rw,
3013                                   &home_device, &home_device_rw,
3014                                   &srv_device, &srv_device_rw,
3015                                   &secondary);
3016                 if (r < 0)
3017                         goto finish;
3018         }
3019
3020         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3021         if (master < 0) {
3022                 log_error("Failed to acquire pseudo tty: %m");
3023                 goto finish;
3024         }
3025
3026         console = ptsname(master);
3027         if (!console) {
3028                 log_error("Failed to determine tty name: %m");
3029                 goto finish;
3030         }
3031
3032         if (!arg_quiet)
3033                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3034                          arg_machine, arg_image ? arg_image : arg_directory);
3035
3036         if (unlockpt(master) < 0) {
3037                 log_error("Failed to unlock tty: %m");
3038                 goto finish;
3039         }
3040
3041         if (access("/dev/kdbus/control", F_OK) >= 0) {
3042
3043                 if (arg_share_system) {
3044                         kdbus_domain = strdup("/dev/kdbus");
3045                         if (!kdbus_domain) {
3046                                 log_oom();
3047                                 goto finish;
3048                         }
3049                 } else {
3050                         const char *ns;
3051
3052                         ns = strappenda("machine-", arg_machine);
3053                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
3054                         if (r < 0)
3055                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
3056                         else
3057                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
3058                 }
3059         }
3060
3061         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3062                 log_error("Failed to create kmsg socket pair: %m");
3063                 goto finish;
3064         }
3065
3066         sd_notify(0, "READY=1");
3067
3068         assert_se(sigemptyset(&mask) == 0);
3069         assert_se(sigemptyset(&mask_chld) == 0);
3070         sigaddset(&mask_chld, SIGCHLD);
3071         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3072         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3073
3074         for (;;) {
3075                 ContainerStatus container_status;
3076                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3077                 struct sigaction sa = {
3078                         .sa_handler = nop_handler,
3079                         .sa_flags = SA_NOCLDSTOP,
3080                 };
3081
3082                 r = barrier_create(&barrier);
3083                 if (r < 0) {
3084                         log_error("Cannot initialize IPC barrier: %s", strerror(-r));
3085                         goto finish;
3086                 }
3087
3088                 /* Child can be killed before execv(), so handle SIGCHLD
3089                  * in order to interrupt parent's blocking calls and
3090                  * give it a chance to call wait() and terminate. */
3091                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3092                 if (r < 0) {
3093                         log_error("Failed to change the signal mask: %m");
3094                         goto finish;
3095                 }
3096
3097                 r = sigaction(SIGCHLD, &sa, NULL);
3098                 if (r < 0) {
3099                         log_error("Failed to install SIGCHLD handler: %m");
3100                         goto finish;
3101                 }
3102
3103                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3104                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3105                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
3106                 if (pid < 0) {
3107                         if (errno == EINVAL)
3108                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3109                         else
3110                                 log_error("clone() failed: %m");
3111
3112                         r = pid;
3113                         goto finish;
3114                 }
3115
3116                 if (pid == 0) {
3117                         /* child */
3118                         _cleanup_free_ char *home = NULL;
3119                         unsigned n_env = 2;
3120                         const char *envp[] = {
3121                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3122                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3123                                 NULL, /* TERM */
3124                                 NULL, /* HOME */
3125                                 NULL, /* USER */
3126                                 NULL, /* LOGNAME */
3127                                 NULL, /* container_uuid */
3128                                 NULL, /* LISTEN_FDS */
3129                                 NULL, /* LISTEN_PID */
3130                                 NULL
3131                         };
3132                         char **env_use;
3133
3134                         barrier_set_role(&barrier, BARRIER_CHILD);
3135
3136                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3137                         if (envp[n_env])
3138                                 n_env ++;
3139
3140                         master = safe_close(master);
3141
3142                         close_nointr(STDIN_FILENO);
3143                         close_nointr(STDOUT_FILENO);
3144                         close_nointr(STDERR_FILENO);
3145
3146                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3147
3148                         reset_all_signal_handlers();
3149
3150                         assert_se(sigemptyset(&mask) == 0);
3151                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
3152
3153                         k = open_terminal(console, O_RDWR);
3154                         if (k != STDIN_FILENO) {
3155                                 if (k >= 0) {
3156                                         safe_close(k);
3157                                         k = -EINVAL;
3158                                 }
3159
3160                                 log_error("Failed to open console: %s", strerror(-k));
3161                                 _exit(EXIT_FAILURE);
3162                         }
3163
3164                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3165                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3166                                 log_error("Failed to duplicate console: %m");
3167                                 _exit(EXIT_FAILURE);
3168                         }
3169
3170                         if (setsid() < 0) {
3171                                 log_error("setsid() failed: %m");
3172                                 _exit(EXIT_FAILURE);
3173                         }
3174
3175                         if (reset_audit_loginuid() < 0)
3176                                 _exit(EXIT_FAILURE);
3177
3178                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3179                                 log_error("PR_SET_PDEATHSIG failed: %m");
3180                                 _exit(EXIT_FAILURE);
3181                         }
3182
3183                         /* Mark everything as slave, so that we still
3184                          * receive mounts from the real root, but don't
3185                          * propagate mounts to the real root. */
3186                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3187                                 log_error("MS_SLAVE|MS_REC failed: %m");
3188                                 _exit(EXIT_FAILURE);
3189                         }
3190
3191                         if (mount_devices(arg_directory,
3192                                           root_device, root_device_rw,
3193                                           home_device, home_device_rw,
3194                                           srv_device, srv_device_rw) < 0)
3195                                 _exit(EXIT_FAILURE);
3196
3197                         /* Turn directory into bind mount */
3198                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3199                                 log_error("Failed to make bind mount: %m");
3200                                 _exit(EXIT_FAILURE);
3201                         }
3202
3203                         r = setup_volatile(arg_directory);
3204                         if (r < 0)
3205                                 _exit(EXIT_FAILURE);
3206
3207                         if (setup_volatile_state(arg_directory) < 0)
3208                                 _exit(EXIT_FAILURE);
3209
3210                         r = base_filesystem_create(arg_directory);
3211                         if (r < 0)
3212                                 _exit(EXIT_FAILURE);
3213
3214                         if (arg_read_only) {
3215                                 k = bind_remount_recursive(arg_directory, true);
3216                                 if (k < 0) {
3217                                         log_error("Failed to make tree read-only: %s", strerror(-k));
3218                                         _exit(EXIT_FAILURE);
3219                                 }
3220                         }
3221
3222                         if (mount_all(arg_directory) < 0)
3223                                 _exit(EXIT_FAILURE);
3224
3225                         if (copy_devnodes(arg_directory) < 0)
3226                                 _exit(EXIT_FAILURE);
3227
3228                         if (setup_ptmx(arg_directory) < 0)
3229                                 _exit(EXIT_FAILURE);
3230
3231                         dev_setup(arg_directory);
3232
3233                         if (setup_seccomp() < 0)
3234                                 _exit(EXIT_FAILURE);
3235
3236                         if (setup_dev_console(arg_directory, console) < 0)
3237                                 _exit(EXIT_FAILURE);
3238
3239                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3240                                 _exit(EXIT_FAILURE);
3241
3242                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3243
3244                         if (setup_boot_id(arg_directory) < 0)
3245                                 _exit(EXIT_FAILURE);
3246
3247                         if (setup_timezone(arg_directory) < 0)
3248                                 _exit(EXIT_FAILURE);
3249
3250                         if (setup_resolv_conf(arg_directory) < 0)
3251                                 _exit(EXIT_FAILURE);
3252
3253                         if (setup_journal(arg_directory) < 0)
3254                                 _exit(EXIT_FAILURE);
3255
3256                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3257                                 _exit(EXIT_FAILURE);
3258
3259                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3260                                 _exit(EXIT_FAILURE);
3261
3262                         if (mount_tmpfs(arg_directory) < 0)
3263                                 _exit(EXIT_FAILURE);
3264
3265                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3266                                 _exit(EXIT_FAILURE);
3267
3268                         /* Tell the parent that we are ready, and that
3269                          * it can cgroupify us to that we lack access
3270                          * to certain devices and resources. */
3271                         barrier_place(&barrier);
3272
3273                         if (chdir(arg_directory) < 0) {
3274                                 log_error("chdir(%s) failed: %m", arg_directory);
3275                                 _exit(EXIT_FAILURE);
3276                         }
3277
3278                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3279                                 log_error("mount(MS_MOVE) failed: %m");
3280                                 _exit(EXIT_FAILURE);
3281                         }
3282
3283                         if (chroot(".") < 0) {
3284                                 log_error("chroot() failed: %m");
3285                                 _exit(EXIT_FAILURE);
3286                         }
3287
3288                         if (chdir("/") < 0) {
3289                                 log_error("chdir() failed: %m");
3290                                 _exit(EXIT_FAILURE);
3291                         }
3292
3293                         umask(0022);
3294
3295                         if (arg_private_network)
3296                                 loopback_setup();
3297
3298                         if (drop_capabilities() < 0) {
3299                                 log_error("drop_capabilities() failed: %m");
3300                                 _exit(EXIT_FAILURE);
3301                         }
3302
3303                         r = change_uid_gid(&home);
3304                         if (r < 0)
3305                                 _exit(EXIT_FAILURE);
3306
3307                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3308                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3309                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3310                                 log_oom();
3311                                 _exit(EXIT_FAILURE);
3312                         }
3313
3314                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3315                                 char as_uuid[37];
3316
3317                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3318                                         log_oom();
3319                                         _exit(EXIT_FAILURE);
3320                                 }
3321                         }
3322
3323                         if (fdset_size(fds) > 0) {
3324                                 k = fdset_cloexec(fds, false);
3325                                 if (k < 0) {
3326                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3327                                         _exit(EXIT_FAILURE);
3328                                 }
3329
3330                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3331                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3332                                         log_oom();
3333                                         _exit(EXIT_FAILURE);
3334                                 }
3335                         }
3336
3337                         setup_hostname();
3338
3339                         if (arg_personality != 0xffffffffLU) {
3340                                 if (personality(arg_personality) < 0) {
3341                                         log_error("personality() failed: %m");
3342                                         _exit(EXIT_FAILURE);
3343                                 }
3344                         } else if (secondary) {
3345                                 if (personality(PER_LINUX32) < 0) {
3346                                         log_error("personality() failed: %m");
3347                                         _exit(EXIT_FAILURE);
3348                                 }
3349                         }
3350
3351 #ifdef HAVE_SELINUX
3352                         if (arg_selinux_context)
3353                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3354                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3355                                         _exit(EXIT_FAILURE);
3356                                 }
3357 #endif
3358
3359                         if (!strv_isempty(arg_setenv)) {
3360                                 char **n;
3361
3362                                 n = strv_env_merge(2, envp, arg_setenv);
3363                                 if (!n) {
3364                                         log_oom();
3365                                         _exit(EXIT_FAILURE);
3366                                 }
3367
3368                                 env_use = n;
3369                         } else
3370                                 env_use = (char**) envp;
3371
3372                         /* Wait until the parent is ready with the setup, too... */
3373                         if (!barrier_place_and_sync(&barrier))
3374                                 _exit(EXIT_FAILURE);
3375
3376                         if (arg_boot) {
3377                                 char **a;
3378                                 size_t l;
3379
3380                                 /* Automatically search for the init system */
3381
3382                                 l = 1 + argc - optind;
3383                                 a = newa(char*, l + 1);
3384                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3385
3386                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3387                                 execve(a[0], a, env_use);
3388
3389                                 a[0] = (char*) "/lib/systemd/systemd";
3390                                 execve(a[0], a, env_use);
3391
3392                                 a[0] = (char*) "/sbin/init";
3393                                 execve(a[0], a, env_use);
3394                         } else if (argc > optind)
3395                                 execvpe(argv[optind], argv + optind, env_use);
3396                         else {
3397                                 chdir(home ? home : "/root");
3398                                 execle("/bin/bash", "-bash", NULL, env_use);
3399                                 execle("/bin/sh", "-sh", NULL, env_use);
3400                         }
3401
3402                         log_error("execv() failed: %m");
3403                         _exit(EXIT_FAILURE);
3404                 }
3405
3406                 barrier_set_role(&barrier, BARRIER_PARENT);
3407                 fdset_free(fds);
3408                 fds = NULL;
3409
3410                 /* wait for child-setup to be done */
3411                 if (barrier_place_and_sync(&barrier)) {
3412                         int ifi = 0;
3413
3414                         r = move_network_interfaces(pid);
3415                         if (r < 0)
3416                                 goto finish;
3417
3418                         r = setup_veth(pid, veth_name, &ifi);
3419                         if (r < 0)
3420                                 goto finish;
3421
3422                         r = setup_bridge(veth_name, &ifi);
3423                         if (r < 0)
3424                                 goto finish;
3425
3426                         r = setup_macvlan(pid);
3427                         if (r < 0)
3428                                 goto finish;
3429
3430                         r = register_machine(pid, ifi);
3431                         if (r < 0)
3432                                 goto finish;
3433
3434                         /* Block SIGCHLD here, before notifying child.
3435                          * process_pty() will handle it with the other signals. */
3436                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3437                         if (r < 0)
3438                                 goto finish;
3439
3440                         /* Reset signal to default */
3441                         r = default_signals(SIGCHLD, -1);
3442                         if (r < 0)
3443                                 goto finish;
3444
3445                         /* Notify the child that the parent is ready with all
3446                          * its setup, and that the child can now hand over
3447                          * control to the code to run inside the container. */
3448                         barrier_place(&barrier);
3449
3450                         k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3451                         if (k < 0) {
3452                                 r = EXIT_FAILURE;
3453                                 break;
3454                         }
3455
3456                         if (!arg_quiet)
3457                                 putc('\n', stdout);
3458
3459                         /* Kill if it is not dead yet anyway */
3460                         terminate_machine(pid);
3461                 }
3462
3463                 /* Normally redundant, but better safe than sorry */
3464                 kill(pid, SIGKILL);
3465
3466                 r = wait_for_container(pid, &container_status);
3467                 pid = 0;
3468
3469                 if (r < 0) {
3470                         /* We failed to wait for the container, or the
3471                          * container exited abnormally */
3472                         r = EXIT_FAILURE;
3473                         break;
3474                 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3475                         /* The container exited with a non-zero
3476                          * status, or with zero status and no reboot
3477                          * was requested. */
3478                         break;
3479
3480                 /* CONTAINER_REBOOTED, loop again */
3481
3482                 if (arg_keep_unit) {
3483                         /* Special handling if we are running as a
3484                          * service: instead of simply restarting the
3485                          * machine we want to restart the entire
3486                          * service, so let's inform systemd about this
3487                          * with the special exit code 133. The service
3488                          * file uses RestartForceExitStatus=133 so
3489                          * that this results in a full nspawn
3490                          * restart. This is necessary since we might
3491                          * have cgroup parameters set we want to have
3492                          * flushed out. */
3493                         r = 133;
3494                         break;
3495                 }
3496         }
3497
3498 finish:
3499         loop_remove(loop_nr, &image_fd);
3500
3501         if (pid > 0)
3502                 kill(pid, SIGKILL);
3503
3504         free(arg_directory);
3505         free(arg_machine);
3506         free(arg_user);
3507         strv_free(arg_setenv);
3508         strv_free(arg_network_interfaces);
3509         strv_free(arg_network_macvlan);
3510         strv_free(arg_bind);
3511         strv_free(arg_bind_ro);
3512         strv_free(arg_tmpfs);
3513
3514         return r;
3515 }