chiark / gitweb /
b118c739e8c6b0f562b2daf14520d15443d420e4
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92
93 #ifdef HAVE_SECCOMP
94 #include "seccomp-util.h"
95 #endif
96
/* Container outcome codes. Not referenced in the option-parsing and
 * mount code below; presumably reported by the supervising loop
 * further down the file -- TODO confirm. */
enum ContainerStatus {
        CONTAINER_TERMINATED,
        CONTAINER_REBOOTED
};
typedef enum ContainerStatus ContainerStatus;

/* Journal linking mode, selected with --link-journal= (or -j). */
enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto (the default) */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, also -j */
};
typedef enum LinkJournal LinkJournal;

/* Volatile mode, selected with --volatile[=MODE]. */
enum Volatile {
        VOLATILE_NO,    /* option absent, or a false boolean */
        VOLATILE_YES,   /* bare --volatile, or a true boolean */
        VOLATILE_STATE, /* --volatile=state */
};
typedef enum Volatile Volatile;
114
115 static char *arg_directory = NULL;
116 static char *arg_user = NULL;
117 static sd_id128_t arg_uuid = {};
118 static char *arg_machine = NULL;
119 static const char *arg_selinux_context = NULL;
120 static const char *arg_selinux_apifs_context = NULL;
121 static const char *arg_slice = NULL;
122 static bool arg_private_network = false;
123 static bool arg_read_only = false;
124 static bool arg_boot = false;
125 static LinkJournal arg_link_journal = LINK_AUTO;
126 static uint64_t arg_retain =
127         (1ULL << CAP_CHOWN) |
128         (1ULL << CAP_DAC_OVERRIDE) |
129         (1ULL << CAP_DAC_READ_SEARCH) |
130         (1ULL << CAP_FOWNER) |
131         (1ULL << CAP_FSETID) |
132         (1ULL << CAP_IPC_OWNER) |
133         (1ULL << CAP_KILL) |
134         (1ULL << CAP_LEASE) |
135         (1ULL << CAP_LINUX_IMMUTABLE) |
136         (1ULL << CAP_NET_BIND_SERVICE) |
137         (1ULL << CAP_NET_BROADCAST) |
138         (1ULL << CAP_NET_RAW) |
139         (1ULL << CAP_SETGID) |
140         (1ULL << CAP_SETFCAP) |
141         (1ULL << CAP_SETPCAP) |
142         (1ULL << CAP_SETUID) |
143         (1ULL << CAP_SYS_ADMIN) |
144         (1ULL << CAP_SYS_CHROOT) |
145         (1ULL << CAP_SYS_NICE) |
146         (1ULL << CAP_SYS_PTRACE) |
147         (1ULL << CAP_SYS_TTY_CONFIG) |
148         (1ULL << CAP_SYS_RESOURCE) |
149         (1ULL << CAP_SYS_BOOT) |
150         (1ULL << CAP_AUDIT_WRITE) |
151         (1ULL << CAP_AUDIT_CONTROL) |
152         (1ULL << CAP_MKNOD);
153 static char **arg_bind = NULL;
154 static char **arg_bind_ro = NULL;
155 static char **arg_tmpfs = NULL;
156 static char **arg_setenv = NULL;
157 static bool arg_quiet = false;
158 static bool arg_share_system = false;
159 static bool arg_register = true;
160 static bool arg_keep_unit = false;
161 static char **arg_network_interfaces = NULL;
162 static char **arg_network_macvlan = NULL;
163 static bool arg_network_veth = false;
164 static const char *arg_network_bridge = NULL;
165 static unsigned long arg_personality = 0xffffffffLU;
166 static const char *arg_image = NULL;
167 static Volatile arg_volatile = VOLATILE_NO;
168
169 static int help(void) {
170
171         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
172                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
173                "  -h --help                 Show this help\n"
174                "     --version              Print version string\n"
175                "  -q --quiet                Do not show status information\n"
176                "  -D --directory=PATH       Root directory for the container\n"
177                "  -i --image=PATH           File system device or image for the container\n"
178                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
179                "  -u --user=USER            Run the command under specified user or uid\n"
180                "  -M --machine=NAME         Set the machine name for the container\n"
181                "     --uuid=UUID            Set a specific machine UUID for the container\n"
182                "  -S --slice=SLICE          Place the container in the specified slice\n"
183                "     --private-network      Disable network in container\n"
184                "     --network-interface=INTERFACE\n"
185                "                            Assign an existing network interface to the\n"
186                "                            container\n"
187                "     --network-macvlan=INTERFACE\n"
188                "                            Create a macvlan network interface based on an\n"
189                "                            existing network interface to the container\n"
190                "     --network-veth         Add a virtual ethernet connection between host\n"
191                "                            and container\n"
192                "     --network-bridge=INTERFACE\n"
193                "                            Add a virtual ethernet connection between host\n"
194                "                            and container and add it to an existing bridge on\n"
195                "                            the host\n"
196                "  -Z --selinux-context=SECLABEL\n"
197                "                            Set the SELinux security context to be used by\n"
198                "                            processes in the container\n"
199                "  -L --selinux-apifs-context=SECLABEL\n"
200                "                            Set the SELinux security context to be used by\n"
201                "                            API/tmpfs file systems in the container\n"
202                "     --capability=CAP       In addition to the default, retain specified\n"
203                "                            capability\n"
204                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
205                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
206                "  -j                        Equivalent to --link-journal=host\n"
207                "     --read-only            Mount the root directory read-only\n"
208                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
209                "                            the container\n"
210                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
211                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
212                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
213                "     --share-system         Share system namespaces with host\n"
214                "     --register=BOOLEAN     Register container as machine\n"
215                "     --keep-unit            Do not register a scope for the machine, reuse\n"
216                "                            the service unit nspawn is running in\n"
217                "     --volatile[=MODE]      Run the system in volatile mode\n",
218                program_invocation_short_name);
219
220         return 0;
221 }
222
223 static int parse_argv(int argc, char *argv[]) {
224
225         enum {
226                 ARG_VERSION = 0x100,
227                 ARG_PRIVATE_NETWORK,
228                 ARG_UUID,
229                 ARG_READ_ONLY,
230                 ARG_CAPABILITY,
231                 ARG_DROP_CAPABILITY,
232                 ARG_LINK_JOURNAL,
233                 ARG_BIND,
234                 ARG_BIND_RO,
235                 ARG_TMPFS,
236                 ARG_SETENV,
237                 ARG_SHARE_SYSTEM,
238                 ARG_REGISTER,
239                 ARG_KEEP_UNIT,
240                 ARG_NETWORK_INTERFACE,
241                 ARG_NETWORK_MACVLAN,
242                 ARG_NETWORK_VETH,
243                 ARG_NETWORK_BRIDGE,
244                 ARG_PERSONALITY,
245                 ARG_VOLATILE,
246         };
247
248         static const struct option options[] = {
249                 { "help",                  no_argument,       NULL, 'h'                   },
250                 { "version",               no_argument,       NULL, ARG_VERSION           },
251                 { "directory",             required_argument, NULL, 'D'                   },
252                 { "user",                  required_argument, NULL, 'u'                   },
253                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
254                 { "boot",                  no_argument,       NULL, 'b'                   },
255                 { "uuid",                  required_argument, NULL, ARG_UUID              },
256                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
257                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
258                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
259                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
260                 { "bind",                  required_argument, NULL, ARG_BIND              },
261                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
262                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
263                 { "machine",               required_argument, NULL, 'M'                   },
264                 { "slice",                 required_argument, NULL, 'S'                   },
265                 { "setenv",                required_argument, NULL, ARG_SETENV            },
266                 { "selinux-context",       required_argument, NULL, 'Z'                   },
267                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
268                 { "quiet",                 no_argument,       NULL, 'q'                   },
269                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
270                 { "register",              required_argument, NULL, ARG_REGISTER          },
271                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
272                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
273                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
274                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
275                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
276                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
277                 { "image",                 required_argument, NULL, 'i'                   },
278                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
279                 {}
280         };
281
282         int c, r;
283         uint64_t plus = 0, minus = 0;
284
285         assert(argc >= 0);
286         assert(argv);
287
288         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
289
290                 switch (c) {
291
292                 case 'h':
293                         return help();
294
295                 case ARG_VERSION:
296                         puts(PACKAGE_STRING);
297                         puts(SYSTEMD_FEATURES);
298                         return 0;
299
300                 case 'D':
301                         free(arg_directory);
302                         arg_directory = canonicalize_file_name(optarg);
303                         if (!arg_directory) {
304                                 log_error("Invalid root directory: %m");
305                                 return -ENOMEM;
306                         }
307
308                         break;
309
310                 case 'i':
311                         arg_image = optarg;
312                         break;
313
314                 case 'u':
315                         free(arg_user);
316                         arg_user = strdup(optarg);
317                         if (!arg_user)
318                                 return log_oom();
319
320                         break;
321
322                 case ARG_NETWORK_BRIDGE:
323                         arg_network_bridge = optarg;
324
325                         /* fall through */
326
327                 case ARG_NETWORK_VETH:
328                         arg_network_veth = true;
329                         arg_private_network = true;
330                         break;
331
332                 case ARG_NETWORK_INTERFACE:
333                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
334                                 return log_oom();
335
336                         arg_private_network = true;
337                         break;
338
339                 case ARG_NETWORK_MACVLAN:
340                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
341                                 return log_oom();
342
343                         /* fall through */
344
345                 case ARG_PRIVATE_NETWORK:
346                         arg_private_network = true;
347                         break;
348
349                 case 'b':
350                         arg_boot = true;
351                         break;
352
353                 case ARG_UUID:
354                         r = sd_id128_from_string(optarg, &arg_uuid);
355                         if (r < 0) {
356                                 log_error("Invalid UUID: %s", optarg);
357                                 return r;
358                         }
359                         break;
360
361                 case 'S':
362                         arg_slice = optarg;
363                         break;
364
365                 case 'M':
366                         if (isempty(optarg)) {
367                                 free(arg_machine);
368                                 arg_machine = NULL;
369                         } else {
370
371                                 if (!hostname_is_valid(optarg)) {
372                                         log_error("Invalid machine name: %s", optarg);
373                                         return -EINVAL;
374                                 }
375
376                                 free(arg_machine);
377                                 arg_machine = strdup(optarg);
378                                 if (!arg_machine)
379                                         return log_oom();
380
381                                 break;
382                         }
383
384                 case 'Z':
385                         arg_selinux_context = optarg;
386                         break;
387
388                 case 'L':
389                         arg_selinux_apifs_context = optarg;
390                         break;
391
392                 case ARG_READ_ONLY:
393                         arg_read_only = true;
394                         break;
395
396                 case ARG_CAPABILITY:
397                 case ARG_DROP_CAPABILITY: {
398                         const char *state, *word;
399                         size_t length;
400
401                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
402                                 _cleanup_free_ char *t;
403                                 cap_value_t cap;
404
405                                 t = strndup(word, length);
406                                 if (!t)
407                                         return log_oom();
408
409                                 if (streq(t, "all")) {
410                                         if (c == ARG_CAPABILITY)
411                                                 plus = (uint64_t) -1;
412                                         else
413                                                 minus = (uint64_t) -1;
414                                 } else {
415                                         if (cap_from_name(t, &cap) < 0) {
416                                                 log_error("Failed to parse capability %s.", t);
417                                                 return -EINVAL;
418                                         }
419
420                                         if (c == ARG_CAPABILITY)
421                                                 plus |= 1ULL << (uint64_t) cap;
422                                         else
423                                                 minus |= 1ULL << (uint64_t) cap;
424                                 }
425                         }
426
427                         break;
428                 }
429
430                 case 'j':
431                         arg_link_journal = LINK_GUEST;
432                         break;
433
434                 case ARG_LINK_JOURNAL:
435                         if (streq(optarg, "auto"))
436                                 arg_link_journal = LINK_AUTO;
437                         else if (streq(optarg, "no"))
438                                 arg_link_journal = LINK_NO;
439                         else if (streq(optarg, "guest"))
440                                 arg_link_journal = LINK_GUEST;
441                         else if (streq(optarg, "host"))
442                                 arg_link_journal = LINK_HOST;
443                         else {
444                                 log_error("Failed to parse link journal mode %s", optarg);
445                                 return -EINVAL;
446                         }
447
448                         break;
449
450                 case ARG_BIND:
451                 case ARG_BIND_RO: {
452                         _cleanup_free_ char *a = NULL, *b = NULL;
453                         char *e;
454                         char ***x;
455
456                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
457
458                         e = strchr(optarg, ':');
459                         if (e) {
460                                 a = strndup(optarg, e - optarg);
461                                 b = strdup(e + 1);
462                         } else {
463                                 a = strdup(optarg);
464                                 b = strdup(optarg);
465                         }
466
467                         if (!a || !b)
468                                 return log_oom();
469
470                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
471                                 log_error("Invalid bind mount specification: %s", optarg);
472                                 return -EINVAL;
473                         }
474
475                         r = strv_extend(x, a);
476                         if (r < 0)
477                                 return log_oom();
478
479                         r = strv_extend(x, b);
480                         if (r < 0)
481                                 return log_oom();
482
483                         break;
484                 }
485
486                 case ARG_TMPFS: {
487                         _cleanup_free_ char *a = NULL, *b = NULL;
488                         char *e;
489
490                         e = strchr(optarg, ':');
491                         if (e) {
492                                 a = strndup(optarg, e - optarg);
493                                 b = strdup(e + 1);
494                         } else {
495                                 a = strdup(optarg);
496                                 b = strdup("mode=0755");
497                         }
498
499                         if (!a || !b)
500                                 return log_oom();
501
502                         if (!path_is_absolute(a)) {
503                                 log_error("Invalid tmpfs specification: %s", optarg);
504                                 return -EINVAL;
505                         }
506
507                         r = strv_push(&arg_tmpfs, a);
508                         if (r < 0)
509                                 return log_oom();
510
511                         a = NULL;
512
513                         r = strv_push(&arg_tmpfs, b);
514                         if (r < 0)
515                                 return log_oom();
516
517                         b = NULL;
518
519                         break;
520                 }
521
522                 case ARG_SETENV: {
523                         char **n;
524
525                         if (!env_assignment_is_valid(optarg)) {
526                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
527                                 return -EINVAL;
528                         }
529
530                         n = strv_env_set(arg_setenv, optarg);
531                         if (!n)
532                                 return log_oom();
533
534                         strv_free(arg_setenv);
535                         arg_setenv = n;
536                         break;
537                 }
538
539                 case 'q':
540                         arg_quiet = true;
541                         break;
542
543                 case ARG_SHARE_SYSTEM:
544                         arg_share_system = true;
545                         break;
546
547                 case ARG_REGISTER:
548                         r = parse_boolean(optarg);
549                         if (r < 0) {
550                                 log_error("Failed to parse --register= argument: %s", optarg);
551                                 return r;
552                         }
553
554                         arg_register = r;
555                         break;
556
557                 case ARG_KEEP_UNIT:
558                         arg_keep_unit = true;
559                         break;
560
561                 case ARG_PERSONALITY:
562
563                         arg_personality = personality_from_string(optarg);
564                         if (arg_personality == 0xffffffffLU) {
565                                 log_error("Unknown or unsupported personality '%s'.", optarg);
566                                 return -EINVAL;
567                         }
568
569                         break;
570
571                 case ARG_VOLATILE:
572
573                         if (!optarg)
574                                 arg_volatile = VOLATILE_YES;
575                         else {
576                                 r = parse_boolean(optarg);
577                                 if (r < 0) {
578                                         if (streq(optarg, "state"))
579                                                 arg_volatile = VOLATILE_STATE;
580                                         else {
581                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
582                                                 return r;
583                                         }
584                                 } else
585                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
586                         }
587
588                         break;
589
590                 case '?':
591                         return -EINVAL;
592
593                 default:
594                         assert_not_reached("Unhandled option");
595                 }
596         }
597
598         if (arg_share_system)
599                 arg_register = false;
600
601         if (arg_boot && arg_share_system) {
602                 log_error("--boot and --share-system may not be combined.");
603                 return -EINVAL;
604         }
605
606         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
607                 log_error("--keep-unit may not be used when invoked from a user session.");
608                 return -EINVAL;
609         }
610
611         if (arg_directory && arg_image) {
612                 log_error("--directory= and --image= may not be combined.");
613                 return -EINVAL;
614         }
615
616         if (arg_volatile != VOLATILE_NO && arg_read_only) {
617                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
618                 return -EINVAL;
619         }
620
621         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
622
623         return 1;
624 }
625
626 static int mount_all(const char *dest) {
627
628         typedef struct MountPoint {
629                 const char *what;
630                 const char *where;
631                 const char *type;
632                 const char *options;
633                 unsigned long flags;
634                 bool fatal;
635         } MountPoint;
636
637         static const MountPoint mount_table[] = {
638                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
639                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
640                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
641                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
642                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
643                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
644                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
645                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
646 #ifdef HAVE_SELINUX
647                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
648                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
649 #endif
650         };
651
652         unsigned k;
653         int r = 0;
654
655         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
656                 _cleanup_free_ char *where = NULL;
657 #ifdef HAVE_SELINUX
658                 _cleanup_free_ char *options = NULL;
659 #endif
660                 const char *o;
661                 int t;
662
663                 where = strjoin(dest, "/", mount_table[k].where, NULL);
664                 if (!where)
665                         return log_oom();
666
667                 t = path_is_mount_point(where, true);
668                 if (t < 0) {
669                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
670
671                         if (r == 0)
672                                 r = t;
673
674                         continue;
675                 }
676
677                 /* Skip this entry if it is not a remount. */
678                 if (mount_table[k].what && t > 0)
679                         continue;
680
681                 mkdir_p(where, 0755);
682
683 #ifdef HAVE_SELINUX
684                 if (arg_selinux_apifs_context &&
685                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
686                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
687                         if (!options)
688                                 return log_oom();
689
690                         o = options;
691                 } else
692 #endif
693                         o = mount_table[k].options;
694
695
696                 if (mount(mount_table[k].what,
697                           where,
698                           mount_table[k].type,
699                           mount_table[k].flags,
700                           o) < 0 &&
701                     mount_table[k].fatal) {
702
703                         log_error("mount(%s) failed: %m", where);
704
705                         if (r == 0)
706                                 r = -errno;
707                 }
708         }
709
710         return r;
711 }
712
713 static int mount_binds(const char *dest, char **l, bool ro) {
714         char **x, **y;
715
716         STRV_FOREACH_PAIR(x, y, l) {
717                 _cleanup_free_ char *where = NULL;
718                 struct stat source_st, dest_st;
719                 int r;
720
721                 if (stat(*x, &source_st) < 0) {
722                         log_error("Failed to stat %s: %m", *x);
723                         return -errno;
724                 }
725
726                 where = strappend(dest, *y);
727                 if (!where)
728                         return log_oom();
729
730                 r = stat(where, &dest_st);
731                 if (r == 0) {
732                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
733                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
734                                 return -EINVAL;
735                         }
736                 } else if (errno == ENOENT) {
737                         r = mkdir_parents_label(where, 0755);
738                         if (r < 0) {
739                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
740                                 return r;
741                         }
742                 } else {
743                         log_error("Failed to bind mount %s: %m", *x);
744                         return -errno;
745                 }
746
747                 /* Create the mount point, but be conservative -- refuse to create block
748                  * and char devices. */
749                 if (S_ISDIR(source_st.st_mode))
750                         mkdir_label(where, 0755);
751                 else if (S_ISFIFO(source_st.st_mode))
752                         mkfifo(where, 0644);
753                 else if (S_ISSOCK(source_st.st_mode))
754                         mknod(where, 0644 | S_IFSOCK, 0);
755                 else if (S_ISREG(source_st.st_mode))
756                         touch(where);
757                 else {
758                         log_error("Refusing to create mountpoint for file: %s", *x);
759                         return -ENOTSUP;
760                 }
761
762                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
763                         log_error("mount(%s) failed: %m", where);
764                         return -errno;
765                 }
766
767                 if (ro) {
768                         r = bind_remount_recursive(where, true);
769                         if (r < 0) {
770                                 log_error("Read-Only bind mount failed: %s", strerror(-r));
771                                 return r;
772                         }
773                 }
774         }
775
776         return 0;
777 }
778
779 static int mount_tmpfs(const char *dest) {
780         char **i, **o;
781
782         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
783                 _cleanup_free_ char *where = NULL;
784
785                 where = strappend(dest, *i);
786                 if (!where)
787                         return log_oom();
788
789                 mkdir_label(where, 0755);
790
791                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
792                         log_error("tmpfs mount to %s failed: %m", where);
793                         return -errno;
794                 }
795         }
796
797         return 0;
798 }
799
800 static int setup_timezone(const char *dest) {
801         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
802         char *z, *y;
803         int r;
804
805         assert(dest);
806
807         /* Fix the timezone, if possible */
808         r = readlink_malloc("/etc/localtime", &p);
809         if (r < 0) {
810                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
811                 return 0;
812         }
813
814         z = path_startswith(p, "../usr/share/zoneinfo/");
815         if (!z)
816                 z = path_startswith(p, "/usr/share/zoneinfo/");
817         if (!z) {
818                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
819                 return 0;
820         }
821
822         where = strappend(dest, "/etc/localtime");
823         if (!where)
824                 return log_oom();
825
826         r = readlink_malloc(where, &q);
827         if (r >= 0) {
828                 y = path_startswith(q, "../usr/share/zoneinfo/");
829                 if (!y)
830                         y = path_startswith(q, "/usr/share/zoneinfo/");
831
832                 /* Already pointing to the right place? Then do nothing .. */
833                 if (y && streq(y, z))
834                         return 0;
835         }
836
837         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
838         if (!check)
839                 return log_oom();
840
841         if (access(check, F_OK) < 0) {
842                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
843                 return 0;
844         }
845
846         what = strappend("../usr/share/zoneinfo/", z);
847         if (!what)
848                 return log_oom();
849
850         mkdir_parents(where, 0755);
851         unlink(where);
852
853         if (symlink(what, where) < 0) {
854                 log_error("Failed to correct timezone of container: %m");
855                 return 0;
856         }
857
858         return 0;
859 }
860
861 static int setup_resolv_conf(const char *dest) {
862         _cleanup_free_ char *where = NULL;
863
864         assert(dest);
865
866         if (arg_private_network)
867                 return 0;
868
869         /* Fix resolv.conf, if possible */
870         where = strappend(dest, "/etc/resolv.conf");
871         if (!where)
872                 return log_oom();
873
874         /* We don't really care for the results of this really. If it
875          * fails, it fails, but meh... */
876         mkdir_parents(where, 0755);
877         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
878
879         return 0;
880 }
881
882 static int setup_volatile_state(const char *directory) {
883         const char *p;
884         int r;
885
886         assert(directory);
887
888         if (arg_volatile != VOLATILE_STATE)
889                 return 0;
890
891         /* --volatile=state means we simply overmount /var
892            with a tmpfs, and the rest read-only. */
893
894         r = bind_remount_recursive(directory, true);
895         if (r < 0) {
896                 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
897                 return r;
898         }
899
900         p = strappenda(directory, "/var");
901         mkdir(p, 0755);
902
903         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
904                 log_error("Failed to mount tmpfs to /var: %m");
905                 return -errno;
906         }
907
908         return 0;
909 }
910
911 static int setup_volatile(const char *directory) {
912         bool tmpfs_mounted = false, bind_mounted = false;
913         char template[] = "/tmp/nspawn-volatile-XXXXXX";
914         const char *f, *t;
915         int r;
916
917         assert(directory);
918
919         if (arg_volatile != VOLATILE_YES)
920                 return 0;
921
922         /* --volatile=yes means we mount a tmpfs to the root dir, and
923            the original /usr to use inside it, and that read-only. */
924
925         if (!mkdtemp(template)) {
926                 log_error("Failed to create temporary directory: %m");
927                 return -errno;
928         }
929
930         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
931                 log_error("Failed to mount tmpfs for root directory: %m");
932                 r = -errno;
933                 goto fail;
934         }
935
936         tmpfs_mounted = true;
937
938         f = strappenda(directory, "/usr");
939         t = strappenda(template, "/usr");
940
941         mkdir(t, 0755);
942         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
943                 log_error("Failed to create /usr bind mount: %m");
944                 r = -errno;
945                 goto fail;
946         }
947
948         bind_mounted = true;
949
950         r = bind_remount_recursive(t, true);
951         if (r < 0) {
952                 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
953                 goto fail;
954         }
955
956         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
957                 log_error("Failed to move root mount: %m");
958                 r = -errno;
959                 goto fail;
960         }
961
962         rmdir(template);
963
964         return 0;
965
966 fail:
967         if (bind_mounted)
968                 umount(t);
969         if (tmpfs_mounted)
970                 umount(template);
971         rmdir(template);
972         return r;
973 }
974
975 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
976
977         snprintf(s, 37,
978                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
979                  SD_ID128_FORMAT_VAL(id));
980
981         return s;
982 }
983
984 static int setup_boot_id(const char *dest) {
985         _cleanup_free_ char *from = NULL, *to = NULL;
986         sd_id128_t rnd = {};
987         char as_uuid[37];
988         int r;
989
990         assert(dest);
991
992         if (arg_share_system)
993                 return 0;
994
995         /* Generate a new randomized boot ID, so that each boot-up of
996          * the container gets a new one */
997
998         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
999         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1000         if (!from || !to)
1001                 return log_oom();
1002
1003         r = sd_id128_randomize(&rnd);
1004         if (r < 0) {
1005                 log_error("Failed to generate random boot id: %s", strerror(-r));
1006                 return r;
1007         }
1008
1009         id128_format_as_uuid(rnd, as_uuid);
1010
1011         r = write_string_file(from, as_uuid);
1012         if (r < 0) {
1013                 log_error("Failed to write boot id: %s", strerror(-r));
1014                 return r;
1015         }
1016
1017         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1018                 log_error("Failed to bind mount boot id: %m");
1019                 r = -errno;
1020         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1021                 log_warning("Failed to make boot id read-only: %m");
1022
1023         unlink(from);
1024         return r;
1025 }
1026
1027 static int copy_devnodes(const char *dest) {
1028
1029         static const char devnodes[] =
1030                 "null\0"
1031                 "zero\0"
1032                 "full\0"
1033                 "random\0"
1034                 "urandom\0"
1035                 "tty\0";
1036
1037         const char *d;
1038         int r = 0;
1039         _cleanup_umask_ mode_t u;
1040
1041         assert(dest);
1042
1043         u = umask(0000);
1044
1045         NULSTR_FOREACH(d, devnodes) {
1046                 _cleanup_free_ char *from = NULL, *to = NULL;
1047                 struct stat st;
1048
1049                 from = strappend("/dev/", d);
1050                 to = strjoin(dest, "/dev/", d, NULL);
1051                 if (!from || !to)
1052                         return log_oom();
1053
1054                 if (stat(from, &st) < 0) {
1055
1056                         if (errno != ENOENT) {
1057                                 log_error("Failed to stat %s: %m", from);
1058                                 return -errno;
1059                         }
1060
1061                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1062
1063                         log_error("%s is not a char or block device, cannot copy", from);
1064                         return -EIO;
1065
1066                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1067
1068                         log_error("mknod(%s) failed: %m", dest);
1069                         return  -errno;
1070                 }
1071         }
1072
1073         return r;
1074 }
1075
1076 static int setup_ptmx(const char *dest) {
1077         _cleanup_free_ char *p = NULL;
1078
1079         p = strappend(dest, "/dev/ptmx");
1080         if (!p)
1081                 return log_oom();
1082
1083         if (symlink("pts/ptmx", p) < 0) {
1084                 log_error("Failed to create /dev/ptmx symlink: %m");
1085                 return -errno;
1086         }
1087
1088         return 0;
1089 }
1090
1091 static int setup_dev_console(const char *dest, const char *console) {
1092         _cleanup_umask_ mode_t u;
1093         const char *to;
1094         struct stat st;
1095         int r;
1096
1097         assert(dest);
1098         assert(console);
1099
1100         u = umask(0000);
1101
1102         if (stat("/dev/null", &st) < 0) {
1103                 log_error("Failed to stat /dev/null: %m");
1104                 return -errno;
1105         }
1106
1107         r = chmod_and_chown(console, 0600, 0, 0);
1108         if (r < 0) {
1109                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
1110                 return r;
1111         }
1112
1113         /* We need to bind mount the right tty to /dev/console since
1114          * ptys can only exist on pts file systems. To have something
1115          * to bind mount things on we create a device node first, and
1116          * use /dev/null for that since we the cgroups device policy
1117          * allows us to create that freely, while we cannot create
1118          * /dev/console. (Note that the major minor doesn't actually
1119          * matter here, since we mount it over anyway). */
1120
1121         to = strappenda(dest, "/dev/console");
1122         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1123                 log_error("mknod() for /dev/console failed: %m");
1124                 return -errno;
1125         }
1126
1127         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1128                 log_error("Bind mount for /dev/console failed: %m");
1129                 return -errno;
1130         }
1131
1132         return 0;
1133 }
1134
1135 static int setup_kmsg(const char *dest, int kmsg_socket) {
1136         _cleanup_free_ char *from = NULL, *to = NULL;
1137         int r, fd, k;
1138         _cleanup_umask_ mode_t u;
1139         union {
1140                 struct cmsghdr cmsghdr;
1141                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1142         } control = {};
1143         struct msghdr mh = {
1144                 .msg_control = &control,
1145                 .msg_controllen = sizeof(control),
1146         };
1147         struct cmsghdr *cmsg;
1148
1149         assert(dest);
1150         assert(kmsg_socket >= 0);
1151
1152         u = umask(0000);
1153
1154         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1155          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1156          * on the reading side behave very similar to /proc/kmsg,
1157          * their writing side behaves differently from /dev/kmsg in
1158          * that writing blocks when nothing is reading. In order to
1159          * avoid any problems with containers deadlocking due to this
1160          * we simply make /dev/kmsg unavailable to the container. */
1161         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1162             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1163                 return log_oom();
1164
1165         if (mkfifo(from, 0600) < 0) {
1166                 log_error("mkfifo() for /dev/kmsg failed: %m");
1167                 return -errno;
1168         }
1169
1170         r = chmod_and_chown(from, 0600, 0, 0);
1171         if (r < 0) {
1172                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1173                 return r;
1174         }
1175
1176         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1177                 log_error("Bind mount for /proc/kmsg failed: %m");
1178                 return -errno;
1179         }
1180
1181         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1182         if (fd < 0) {
1183                 log_error("Failed to open fifo: %m");
1184                 return -errno;
1185         }
1186
1187         cmsg = CMSG_FIRSTHDR(&mh);
1188         cmsg->cmsg_level = SOL_SOCKET;
1189         cmsg->cmsg_type = SCM_RIGHTS;
1190         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1191         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1192
1193         mh.msg_controllen = cmsg->cmsg_len;
1194
1195         /* Store away the fd in the socket, so that it stays open as
1196          * long as we run the child */
1197         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1198         safe_close(fd);
1199
1200         if (k < 0) {
1201                 log_error("Failed to send FIFO fd: %m");
1202                 return -errno;
1203         }
1204
1205         /* And now make the FIFO unavailable as /dev/kmsg... */
1206         unlink(from);
1207         return 0;
1208 }
1209
1210 static int setup_hostname(void) {
1211
1212         if (arg_share_system)
1213                 return 0;
1214
1215         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1216                 return -errno;
1217
1218         return 0;
1219 }
1220
1221 static int setup_journal(const char *directory) {
1222         sd_id128_t machine_id, this_id;
1223         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1224         char *id;
1225         int r;
1226
1227         p = strappend(directory, "/etc/machine-id");
1228         if (!p)
1229                 return log_oom();
1230
1231         r = read_one_line_file(p, &b);
1232         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1233                 return 0;
1234         else if (r < 0) {
1235                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1236                 return r;
1237         }
1238
1239         id = strstrip(b);
1240         if (isempty(id) && arg_link_journal == LINK_AUTO)
1241                 return 0;
1242
1243         /* Verify validity */
1244         r = sd_id128_from_string(id, &machine_id);
1245         if (r < 0) {
1246                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1247                 return r;
1248         }
1249
1250         r = sd_id128_get_machine(&this_id);
1251         if (r < 0) {
1252                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1253                 return r;
1254         }
1255
1256         if (sd_id128_equal(machine_id, this_id)) {
1257                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1258                          "Host and machine ids are equal (%s): refusing to link journals", id);
1259                 if (arg_link_journal == LINK_AUTO)
1260                         return 0;
1261                 return
1262                         -EEXIST;
1263         }
1264
1265         if (arg_link_journal == LINK_NO)
1266                 return 0;
1267
1268         free(p);
1269         p = strappend("/var/log/journal/", id);
1270         q = strjoin(directory, "/var/log/journal/", id, NULL);
1271         if (!p || !q)
1272                 return log_oom();
1273
1274         if (path_is_mount_point(p, false) > 0) {
1275                 if (arg_link_journal != LINK_AUTO) {
1276                         log_error("%s: already a mount point, refusing to use for journal", p);
1277                         return -EEXIST;
1278                 }
1279
1280                 return 0;
1281         }
1282
1283         if (path_is_mount_point(q, false) > 0) {
1284                 if (arg_link_journal != LINK_AUTO) {
1285                         log_error("%s: already a mount point, refusing to use for journal", q);
1286                         return -EEXIST;
1287                 }
1288
1289                 return 0;
1290         }
1291
1292         r = readlink_and_make_absolute(p, &d);
1293         if (r >= 0) {
1294                 if ((arg_link_journal == LINK_GUEST ||
1295                      arg_link_journal == LINK_AUTO) &&
1296                     path_equal(d, q)) {
1297
1298                         r = mkdir_p(q, 0755);
1299                         if (r < 0)
1300                                 log_warning("failed to create directory %s: %m", q);
1301                         return 0;
1302                 }
1303
1304                 if (unlink(p) < 0) {
1305                         log_error("Failed to remove symlink %s: %m", p);
1306                         return -errno;
1307                 }
1308         } else if (r == -EINVAL) {
1309
1310                 if (arg_link_journal == LINK_GUEST &&
1311                     rmdir(p) < 0) {
1312
1313                         if (errno == ENOTDIR) {
1314                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1315                                 return r;
1316                         } else {
1317                                 log_error("Failed to remove %s: %m", p);
1318                                 return -errno;
1319                         }
1320                 }
1321         } else if (r != -ENOENT) {
1322                 log_error("readlink(%s) failed: %m", p);
1323                 return r;
1324         }
1325
1326         if (arg_link_journal == LINK_GUEST) {
1327
1328                 if (symlink(q, p) < 0) {
1329                         log_error("Failed to symlink %s to %s: %m", q, p);
1330                         return -errno;
1331                 }
1332
1333                 r = mkdir_p(q, 0755);
1334                 if (r < 0)
1335                         log_warning("failed to create directory %s: %m", q);
1336                 return 0;
1337         }
1338
1339         if (arg_link_journal == LINK_HOST) {
1340                 r = mkdir_p(p, 0755);
1341                 if (r < 0) {
1342                         log_error("Failed to create %s: %m", p);
1343                         return r;
1344                 }
1345
1346         } else if (access(p, F_OK) < 0)
1347                 return 0;
1348
1349         if (dir_is_empty(q) == 0)
1350                 log_warning("%s is not empty, proceeding anyway.", q);
1351
1352         r = mkdir_p(q, 0755);
1353         if (r < 0) {
1354                 log_error("Failed to create %s: %m", q);
1355                 return r;
1356         }
1357
1358         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1359                 log_error("Failed to bind mount journal from host into guest: %m");
1360                 return -errno;
1361         }
1362
1363         return 0;
1364 }
1365
/* Creates the directory /dev/kdbus below dest and bind mounts path
 * onto it. Does nothing if path is NULL. Returns 0 on success, a
 * negative errno-style value otherwise. */
static int setup_kdbus(const char *dest, const char *path) {
        const char *target;

        if (path == NULL)
                return 0;

        target = strappenda(dest, "/dev/kdbus");

        if (mkdir(target, 0755) < 0) {
                log_error("Failed to create kdbus path: %m");
                return -errno;
        }

        if (mount(path, target, "bind", MS_BIND, NULL) >= 0)
                return 0;

        log_error("Failed to mount kdbus domain path: %m");
        return -errno;
}
1385
1386 static int drop_capabilities(void) {
1387         return capability_bounding_set_drop(~arg_retain, false);
1388 }
1389
1390 static int register_machine(pid_t pid, int local_ifindex) {
1391         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1392         _cleanup_bus_unref_ sd_bus *bus = NULL;
1393         int r;
1394
1395         if (!arg_register)
1396                 return 0;
1397
1398         r = sd_bus_default_system(&bus);
1399         if (r < 0) {
1400                 log_error("Failed to open system bus: %s", strerror(-r));
1401                 return r;
1402         }
1403
1404         if (arg_keep_unit) {
1405                 r = sd_bus_call_method(
1406                                 bus,
1407                                 "org.freedesktop.machine1",
1408                                 "/org/freedesktop/machine1",
1409                                 "org.freedesktop.machine1.Manager",
1410                                 "RegisterMachineWithNetwork",
1411                                 &error,
1412                                 NULL,
1413                                 "sayssusai",
1414                                 arg_machine,
1415                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1416                                 "nspawn",
1417                                 "container",
1418                                 (uint32_t) pid,
1419                                 strempty(arg_directory),
1420                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1421         } else {
1422                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1423
1424                 r = sd_bus_message_new_method_call(
1425                                 bus,
1426                                 &m,
1427                                 "org.freedesktop.machine1",
1428                                 "/org/freedesktop/machine1",
1429                                 "org.freedesktop.machine1.Manager",
1430                                 "CreateMachineWithNetwork");
1431                 if (r < 0) {
1432                         log_error("Failed to create message: %s", strerror(-r));
1433                         return r;
1434                 }
1435
1436                 r = sd_bus_message_append(
1437                                 m,
1438                                 "sayssusai",
1439                                 arg_machine,
1440                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1441                                 "nspawn",
1442                                 "container",
1443                                 (uint32_t) pid,
1444                                 strempty(arg_directory),
1445                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1446                 if (r < 0) {
1447                         log_error("Failed to append message arguments: %s", strerror(-r));
1448                         return r;
1449                 }
1450
1451                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1452                 if (r < 0) {
1453                         log_error("Failed to open container: %s", strerror(-r));
1454                         return r;
1455                 }
1456
1457                 if (!isempty(arg_slice)) {
1458                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1459                         if (r < 0) {
1460                                 log_error("Failed to append slice: %s", strerror(-r));
1461                                 return r;
1462                         }
1463                 }
1464
1465                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1466                 if (r < 0) {
1467                         log_error("Failed to add device policy: %s", strerror(-r));
1468                         return r;
1469                 }
1470
1471                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1472                                           /* Allow the container to
1473                                            * access and create the API
1474                                            * device nodes, so that
1475                                            * PrivateDevices= in the
1476                                            * container can work
1477                                            * fine */
1478                                           "/dev/null", "rwm",
1479                                           "/dev/zero", "rwm",
1480                                           "/dev/full", "rwm",
1481                                           "/dev/random", "rwm",
1482                                           "/dev/urandom", "rwm",
1483                                           "/dev/tty", "rwm",
1484                                           /* Allow the container
1485                                            * access to ptys. However,
1486                                            * do not permit the
1487                                            * container to ever create
1488                                            * these device nodes. */
1489                                           "/dev/pts/ptmx", "rw",
1490                                           "char-pts", "rw",
1491                                           /* Allow the container
1492                                            * access to all kdbus
1493                                            * devices. Again, the
1494                                            * container cannot create
1495                                            * these nodes, only use
1496                                            * them. We use a pretty
1497                                            * open match here, so that
1498                                            * the kernel API can still
1499                                            * change. */
1500                                           "char-kdbus", "rw",
1501                                           "char-kdbus/*", "rw");
1502                 if (r < 0) {
1503                         log_error("Failed to add device whitelist: %s", strerror(-r));
1504                         return r;
1505                 }
1506
1507                 r = sd_bus_message_close_container(m);
1508                 if (r < 0) {
1509                         log_error("Failed to close container: %s", strerror(-r));
1510                         return r;
1511                 }
1512
1513                 r = sd_bus_call(bus, m, 0, &error, NULL);
1514         }
1515
1516         if (r < 0) {
1517                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1518                 return r;
1519         }
1520
1521         return 0;
1522 }
1523
1524 static int terminate_machine(pid_t pid) {
1525         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1526         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1527         _cleanup_bus_unref_ sd_bus *bus = NULL;
1528         const char *path;
1529         int r;
1530
1531         if (!arg_register)
1532                 return 0;
1533
1534         r = sd_bus_default_system(&bus);
1535         if (r < 0) {
1536                 log_error("Failed to open system bus: %s", strerror(-r));
1537                 return r;
1538         }
1539
1540         r = sd_bus_call_method(
1541                         bus,
1542                         "org.freedesktop.machine1",
1543                         "/org/freedesktop/machine1",
1544                         "org.freedesktop.machine1.Manager",
1545                         "GetMachineByPID",
1546                         &error,
1547                         &reply,
1548                         "u",
1549                         (uint32_t) pid);
1550         if (r < 0) {
1551                 /* Note that the machine might already have been
1552                  * cleaned up automatically, hence don't consider it a
1553                  * failure if we cannot get the machine object. */
1554                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1555                 return 0;
1556         }
1557
1558         r = sd_bus_message_read(reply, "o", &path);
1559         if (r < 0)
1560                 return bus_log_parse_error(r);
1561
1562         r = sd_bus_call_method(
1563                         bus,
1564                         "org.freedesktop.machine1",
1565                         path,
1566                         "org.freedesktop.machine1.Machine",
1567                         "Terminate",
1568                         &error,
1569                         NULL,
1570                         NULL);
1571         if (r < 0) {
1572                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1573                 return 0;
1574         }
1575
1576         return 0;
1577 }
1578
1579 static int reset_audit_loginuid(void) {
1580         _cleanup_free_ char *p = NULL;
1581         int r;
1582
1583         if (arg_share_system)
1584                 return 0;
1585
1586         r = read_one_line_file("/proc/self/loginuid", &p);
1587         if (r == -ENOENT)
1588                 return 0;
1589         if (r < 0) {
1590                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1591                 return r;
1592         }
1593
1594         /* Already reset? */
1595         if (streq(p, "4294967295"))
1596                 return 0;
1597
1598         r = write_string_file("/proc/self/loginuid", "4294967295");
1599         if (r < 0) {
1600                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1601                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1602                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1603                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1604                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1605
1606                 sleep(5);
1607         }
1608
1609         return 0;
1610 }
1611
1612 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1613
1614 static int get_mac(struct ether_addr *mac) {
1615         int r;
1616
1617         uint8_t result[8];
1618         size_t l, sz;
1619         uint8_t *v;
1620
1621         l = strlen(arg_machine);
1622         sz = sizeof(sd_id128_t) + l;
1623         v = alloca(sz);
1624
1625         /* fetch some persistent data unique to the host */
1626         r = sd_id128_get_machine((sd_id128_t*) v);
1627         if (r < 0)
1628                 return r;
1629
1630         /* combine with some data unique (on this host) to this
1631          * container instance */
1632         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1633
1634         /* Let's hash the host machine ID plus the container name. We
1635          * use a fixed, but originally randomly created hash key here. */
1636         siphash24(result, v, sz, HASH_KEY.bytes);
1637
1638         assert_cc(ETH_ALEN <= sizeof(result));
1639         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1640
1641         /* see eth_random_addr in the kernel */
1642         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1643         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1644
1645         return 0;
1646 }
1647
1648 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1649         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1650         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1651         struct ether_addr mac;
1652         int r, i;
1653
1654         if (!arg_private_network)
1655                 return 0;
1656
1657         if (!arg_network_veth)
1658                 return 0;
1659
1660         /* Use two different interface name prefixes depending whether
1661          * we are in bridge mode or not. */
1662         snprintf(iface_name, IFNAMSIZ, "%s-%s",
1663                  arg_network_bridge ? "vb" : "ve", arg_machine);
1664
1665         r = get_mac(&mac);
1666         if (r < 0) {
1667                 log_error("Failed to generate predictable MAC address for host0");
1668                 return r;
1669         }
1670
1671         r = sd_rtnl_open(&rtnl, 0);
1672         if (r < 0) {
1673                 log_error("Failed to connect to netlink: %s", strerror(-r));
1674                 return r;
1675         }
1676
1677         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1678         if (r < 0) {
1679                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1680                 return r;
1681         }
1682
1683         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1684         if (r < 0) {
1685                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1686                 return r;
1687         }
1688
1689         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1690         if (r < 0) {
1691                 log_error("Failed to open netlink container: %s", strerror(-r));
1692                 return r;
1693         }
1694
1695         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1696         if (r < 0) {
1697                 log_error("Failed to open netlink container: %s", strerror(-r));
1698                 return r;
1699         }
1700
1701         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1702         if (r < 0) {
1703                 log_error("Failed to open netlink container: %s", strerror(-r));
1704                 return r;
1705         }
1706
1707         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1708         if (r < 0) {
1709                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1710                 return r;
1711         }
1712
1713         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1714         if (r < 0) {
1715                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1716                 return r;
1717         }
1718
1719         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1720         if (r < 0) {
1721                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1722                 return r;
1723         }
1724
1725         r = sd_rtnl_message_close_container(m);
1726         if (r < 0) {
1727                 log_error("Failed to close netlink container: %s", strerror(-r));
1728                 return r;
1729         }
1730
1731         r = sd_rtnl_message_close_container(m);
1732         if (r < 0) {
1733                 log_error("Failed to close netlink container: %s", strerror(-r));
1734                 return r;
1735         }
1736
1737         r = sd_rtnl_message_close_container(m);
1738         if (r < 0) {
1739                 log_error("Failed to close netlink container: %s", strerror(-r));
1740                 return r;
1741         }
1742
1743         r = sd_rtnl_call(rtnl, m, 0, NULL);
1744         if (r < 0) {
1745                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1746                 return r;
1747         }
1748
1749         i = (int) if_nametoindex(iface_name);
1750         if (i <= 0) {
1751                 log_error("Failed to resolve interface %s: %m", iface_name);
1752                 return -errno;
1753         }
1754
1755         *ifi = i;
1756
1757         return 0;
1758 }
1759
1760 static int setup_bridge(const char veth_name[], int *ifi) {
1761         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1762         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1763         int r, bridge;
1764
1765         if (!arg_private_network)
1766                 return 0;
1767
1768         if (!arg_network_veth)
1769                 return 0;
1770
1771         if (!arg_network_bridge)
1772                 return 0;
1773
1774         bridge = (int) if_nametoindex(arg_network_bridge);
1775         if (bridge <= 0) {
1776                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1777                 return -errno;
1778         }
1779
1780         *ifi = bridge;
1781
1782         r = sd_rtnl_open(&rtnl, 0);
1783         if (r < 0) {
1784                 log_error("Failed to connect to netlink: %s", strerror(-r));
1785                 return r;
1786         }
1787
1788         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1789         if (r < 0) {
1790                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1791                 return r;
1792         }
1793
1794         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1795         if (r < 0) {
1796                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1797                 return r;
1798         }
1799
1800         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1801         if (r < 0) {
1802                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1803                 return r;
1804         }
1805
1806         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1807         if (r < 0) {
1808                 log_error("Failed to add netlink master field: %s", strerror(-r));
1809                 return r;
1810         }
1811
1812         r = sd_rtnl_call(rtnl, m, 0, NULL);
1813         if (r < 0) {
1814                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1815                 return r;
1816         }
1817
1818         return 0;
1819 }
1820
1821 static int parse_interface(struct udev *udev, const char *name) {
1822         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1823         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1824         int ifi;
1825
1826         ifi = (int) if_nametoindex(name);
1827         if (ifi <= 0) {
1828                 log_error("Failed to resolve interface %s: %m", name);
1829                 return -errno;
1830         }
1831
1832         sprintf(ifi_str, "n%i", ifi);
1833         d = udev_device_new_from_device_id(udev, ifi_str);
1834         if (!d) {
1835                 log_error("Failed to get udev device for interface %s: %m", name);
1836                 return -errno;
1837         }
1838
1839         if (udev_device_get_is_initialized(d) <= 0) {
1840                 log_error("Network interface %s is not initialized yet.", name);
1841                 return -EBUSY;
1842         }
1843
1844         return ifi;
1845 }
1846
1847 static int move_network_interfaces(pid_t pid) {
1848         _cleanup_udev_unref_ struct udev *udev = NULL;
1849         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1850         char **i;
1851         int r;
1852
1853         if (!arg_private_network)
1854                 return 0;
1855
1856         if (strv_isempty(arg_network_interfaces))
1857                 return 0;
1858
1859         r = sd_rtnl_open(&rtnl, 0);
1860         if (r < 0) {
1861                 log_error("Failed to connect to netlink: %s", strerror(-r));
1862                 return r;
1863         }
1864
1865         udev = udev_new();
1866         if (!udev) {
1867                 log_error("Failed to connect to udev.");
1868                 return -ENOMEM;
1869         }
1870
1871         STRV_FOREACH(i, arg_network_interfaces) {
1872                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1873                 int ifi;
1874
1875                 ifi = parse_interface(udev, *i);
1876                 if (ifi < 0)
1877                         return ifi;
1878
1879                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1880                 if (r < 0) {
1881                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1882                         return r;
1883                 }
1884
1885                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1886                 if (r < 0) {
1887                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1888                         return r;
1889                 }
1890
1891                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1892                 if (r < 0) {
1893                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1894                         return r;
1895                 }
1896         }
1897
1898         return 0;
1899 }
1900
1901 static int setup_macvlan(pid_t pid) {
1902         _cleanup_udev_unref_ struct udev *udev = NULL;
1903         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1904         char **i;
1905         int r;
1906
1907         if (!arg_private_network)
1908                 return 0;
1909
1910         if (strv_isempty(arg_network_macvlan))
1911                 return 0;
1912
1913         r = sd_rtnl_open(&rtnl, 0);
1914         if (r < 0) {
1915                 log_error("Failed to connect to netlink: %s", strerror(-r));
1916                 return r;
1917         }
1918
1919         udev = udev_new();
1920         if (!udev) {
1921                 log_error("Failed to connect to udev.");
1922                 return -ENOMEM;
1923         }
1924
1925         STRV_FOREACH(i, arg_network_macvlan) {
1926                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1927                 _cleanup_free_ char *n = NULL;
1928                 int ifi;
1929
1930                 ifi = parse_interface(udev, *i);
1931                 if (ifi < 0)
1932                         return ifi;
1933
1934                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1935                 if (r < 0) {
1936                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1937                         return r;
1938                 }
1939
1940                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1941                 if (r < 0) {
1942                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1943                         return r;
1944                 }
1945
1946                 n = strappend("mv-", *i);
1947                 if (!n)
1948                         return log_oom();
1949
1950                 strshorten(n, IFNAMSIZ-1);
1951
1952                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1953                 if (r < 0) {
1954                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1955                         return r;
1956                 }
1957
1958                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1959                 if (r < 0) {
1960                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1961                         return r;
1962                 }
1963
1964                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1965                 if (r < 0) {
1966                         log_error("Failed to open netlink container: %s", strerror(-r));
1967                         return r;
1968                 }
1969
1970                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1971                 if (r < 0) {
1972                         log_error("Failed to open netlink container: %s", strerror(-r));
1973                         return r;
1974                 }
1975
1976                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1977                 if (r < 0) {
1978                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1979                         return r;
1980                 }
1981
1982                 r = sd_rtnl_message_close_container(m);
1983                 if (r < 0) {
1984                         log_error("Failed to close netlink container: %s", strerror(-r));
1985                         return r;
1986                 }
1987
1988                 r = sd_rtnl_message_close_container(m);
1989                 if (r < 0) {
1990                         log_error("Failed to close netlink container: %s", strerror(-r));
1991                         return r;
1992                 }
1993
1994                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1995                 if (r < 0) {
1996                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1997                         return r;
1998                 }
1999         }
2000
2001         return 0;
2002 }
2003
/* Installs a seccomp filter for the calling process that allows
 * everything by default, but
 *
 *   - makes the system calls listed in denied_syscalls[] fail with
 *     EPERM, and
 *   - makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
 *     EAFNOSUPPORT.
 *
 * Returns 0 on success (and always when built without HAVE_SECCOMP),
 * a negative errno-style value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int denied_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(denied_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), denied_syscalls[k], 0);

                /* -EFAULT means the syscall is unknown: that is not
                 * an error, just go on with the next one. */
                if (r < 0 && r != -EFAULT) {
                        log_error("Failed to block syscall: %s", strerror(-r));
                        goto finish;
                }
        }

        /* Audit is broken in containers: much of the userspace audit
         * hookup fails when running inside one. We don't care and
         * simply turn off the creation of audit sockets.
         *
         * This makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with
         * EAFNOSUPPORT, which audit userspace takes as indication
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        /* Loading the filter shall not turn on NO_NEW_PRIVS */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* The filter context is released on every path */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2083
2084 static int setup_image(char **device_path, int *loop_nr) {
2085         struct loop_info64 info = {
2086                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2087         };
2088         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2089         _cleanup_free_ char* loopdev = NULL;
2090         struct stat st;
2091         int r, nr;
2092
2093         assert(device_path);
2094         assert(loop_nr);
2095
2096         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2097         if (fd < 0) {
2098                 log_error("Failed to open %s: %m", arg_image);
2099                 return -errno;
2100         }
2101
2102         if (fstat(fd, &st) < 0) {
2103                 log_error("Failed to stat %s: %m", arg_image);
2104                 return -errno;
2105         }
2106
2107         if (S_ISBLK(st.st_mode)) {
2108                 char *p;
2109
2110                 p = strdup(arg_image);
2111                 if (!p)
2112                         return log_oom();
2113
2114                 *device_path = p;
2115
2116                 *loop_nr = -1;
2117
2118                 r = fd;
2119                 fd = -1;
2120
2121                 return r;
2122         }
2123
2124         if (!S_ISREG(st.st_mode)) {
2125                 log_error("%s is not a regular file or block device: %m", arg_image);
2126                 return -EINVAL;
2127         }
2128
2129         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2130         if (control < 0) {
2131                 log_error("Failed to open /dev/loop-control: %m");
2132                 return -errno;
2133         }
2134
2135         nr = ioctl(control, LOOP_CTL_GET_FREE);
2136         if (nr < 0) {
2137                 log_error("Failed to allocate loop device: %m");
2138                 return -errno;
2139         }
2140
2141         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2142                 return log_oom();
2143
2144         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2145         if (loop < 0) {
2146                 log_error("Failed to open loop device %s: %m", loopdev);
2147                 return -errno;
2148         }
2149
2150         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2151                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2152                 return -errno;
2153         }
2154
2155         if (arg_read_only)
2156                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2157
2158         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2159                 log_error("Failed to set loopback settings on %s: %m", loopdev);
2160                 return -errno;
2161         }
2162
2163         *device_path = loopdev;
2164         loopdev = NULL;
2165
2166         *loop_nr = nr;
2167
2168         r = loop;
2169         loop = -1;
2170
2171         return r;
2172 }
2173
#ifdef HAVE_BLKID
/* Helper for dissect_image(): considers the partition with device
 * node 'node', partition number 'nr' and GPT flags 'flags' as
 * candidate for one partition type.
 *
 * The candidate is taken if nothing has been picked for this type so
 * far (*device is NULL), or if its partition number is lower than
 * the one picked before (*device_nr). In that case *device is
 * replaced by a copy of 'node', and *device_nr and *device_rw are
 * updated. Otherwise nothing is changed.
 *
 * Returns 0 on success (also if the candidate was skipped), or the
 * result of log_oom() if the copy cannot be allocated. */
static int dissect_image_pick_partition(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, unsigned long long flags) {

        if (*device && nr >= *device_nr)
                return 0;

        *device_nr = nr;
        *device_rw = !(flags & GPT_FLAG_READ_ONLY);

        free(*device);
        *device = strdup(node);
        if (!*device)
                return log_oom();

        return 0;
}
#endif

/* Looks for the root, /home and /srv partitions in the GPT partition
 * table of the block device referred to by fd.
 *
 * Partitions are recognized by their GPT type UUID (GPT_HOME,
 * GPT_SRV, and GPT_ROOT_NATIVE/GPT_ROOT_SECONDARY where defined).
 * Partitions carrying GPT_FLAG_NO_AUTO are ignored. If there are
 * multiple partitions of the same type the one with the lowest
 * partition number wins.
 *
 * On success 0 is returned, and
 *   - *root_device is set to the device node of the native root
 *     partition (*secondary = false) or, if there is none, of the
 *     secondary root partition (*secondary = true),
 *   - *home_device and *srv_device are set to the respective device
 *     nodes, if such partitions exist, and are left untouched
 *     otherwise,
 *   - the matching *_rw flag is set to false if the partition carries
 *     GPT_FLAG_READ_ONLY, to true otherwise.
 * The returned strings are owned by the caller.
 *
 * On failure a negative errno-style value is returned: -EINVAL if no
 * partition table, no GPT or no root partition is found, -ENOTSUP if
 * compiled without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        /* pttype stays NULL if the lookup fails, streq_ptr() copes
         * with that. */
        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate all udev devices below the whole-disk device */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip everything that has no device number ... */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* ... that is the whole-disk device itself ... */
                if (st.st_rdev == qn)
                        continue;

                /* ... that has no device node ... */
                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                /* ... or that blkid does not know as partition. */
                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                /* Partitions of unknown type are ignored (r stays 0) */
                r = 0;
                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_image_pick_partition(&home, &home_nr, &home_rw, node, nr, flags);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_image_pick_partition(&srv, &srv_nr, &srv_rw, node, nr, flags);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_image_pick_partition(&root, &root_nr, &root_rw, node, nr, flags);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_image_pick_partition(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, flags);
#endif
                if (r < 0)
                        return r;
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Pass what we found to the caller. The local pointers are
         * reset to NULL so that the cleanup handlers do not free the
         * strings we hand out. The native root partition is preferred
         * over the secondary one. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2423
/* Mounts the block device 'what' on 'where', or on the concatenation
 * of 'where' and 'directory' if 'directory' is non-NULL.
 *
 * The file system type is probed with blkid. The mount is done with
 * MS_NODEV, and read-only unless 'rw' is true and arg_read_only is
 * unset.
 *
 * Returns 0 on success, a negative errno-style value on failure:
 * -EINVAL if the file system type cannot be determined, -ENOTSUP for
 * LUKS volumes or if compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A read-only container overrides the per-partition setting */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns 1 if nothing was
                 * detected and -2 if the result is ambivalent: in
                 * both cases we do not know the file system type.
                 * -1 is a real probing error and is handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2489
/*
 * Mount the partitions found in a disk image below "where": the root
 * device on "where" itself, the home device on "where"/home and the
 * server data device on "where"/srv. Each device is optional (NULL means
 * skip) and has its own read-write flag.
 *
 * Returns 0 on success, or the negative error from the first mount_device()
 * call that failed; later devices are not attempted.
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Root first, so that /home and /srv land inside it. */
        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2525
2526 static void loop_remove(int nr, int *image_fd) {
2527         _cleanup_close_ int control = -1;
2528
2529         if (nr < 0)
2530                 return;
2531
2532         if (image_fd && *image_fd >= 0) {
2533                 ioctl(*image_fd, LOOP_CLR_FD);
2534                 *image_fd = safe_close(*image_fd);
2535         }
2536
2537         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2538         if (control < 0)
2539                 return;
2540
2541         ioctl(control, LOOP_CTL_REMOVE, nr);
2542 }
2543
2544 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2545         int pipe_fds[2];
2546         pid_t pid;
2547
2548         assert(database);
2549         assert(key);
2550         assert(rpid);
2551
2552         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2553                 log_error("Failed to allocate pipe: %m");
2554                 return -errno;
2555         }
2556
2557         pid = fork();
2558         if (pid < 0) {
2559                 log_error("Failed to fork getent child: %m");
2560                 return -errno;
2561         } else if (pid == 0) {
2562                 int nullfd;
2563                 char *empty_env = NULL;
2564
2565                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2566                         _exit(EXIT_FAILURE);
2567
2568                 if (pipe_fds[0] > 2)
2569                         safe_close(pipe_fds[0]);
2570                 if (pipe_fds[1] > 2)
2571                         safe_close(pipe_fds[1]);
2572
2573                 nullfd = open("/dev/null", O_RDWR);
2574                 if (nullfd < 0)
2575                         _exit(EXIT_FAILURE);
2576
2577                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2578                         _exit(EXIT_FAILURE);
2579
2580                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2581                         _exit(EXIT_FAILURE);
2582
2583                 if (nullfd > 2)
2584                         safe_close(nullfd);
2585
2586                 reset_all_signal_handlers();
2587                 close_all_fds(NULL, 0);
2588
2589                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2590                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2591                 _exit(EXIT_FAILURE);
2592         }
2593
2594         pipe_fds[1] = safe_close(pipe_fds[1]);
2595
2596         *rpid = pid;
2597
2598         return pipe_fds[0];
2599 }
2600
2601 static int change_uid_gid(char **_home) {
2602         char line[LINE_MAX], *x, *u, *g, *h;
2603         const char *word, *state;
2604         _cleanup_free_ uid_t *uids = NULL;
2605         _cleanup_free_ char *home = NULL;
2606         _cleanup_fclose_ FILE *f = NULL;
2607         _cleanup_close_ int fd = -1;
2608         unsigned n_uids = 0;
2609         size_t sz = 0, l;
2610         uid_t uid;
2611         gid_t gid;
2612         pid_t pid;
2613         int r;
2614
2615         assert(_home);
2616
2617         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2618                 /* Reset everything fully to 0, just in case */
2619
2620                 if (setgroups(0, NULL) < 0) {
2621                         log_error("setgroups() failed: %m");
2622                         return -errno;
2623                 }
2624
2625                 if (setresgid(0, 0, 0) < 0) {
2626                         log_error("setregid() failed: %m");
2627                         return -errno;
2628                 }
2629
2630                 if (setresuid(0, 0, 0) < 0) {
2631                         log_error("setreuid() failed: %m");
2632                         return -errno;
2633                 }
2634
2635                 *_home = NULL;
2636                 return 0;
2637         }
2638
2639         /* First, get user credentials */
2640         fd = spawn_getent("passwd", arg_user, &pid);
2641         if (fd < 0)
2642                 return fd;
2643
2644         f = fdopen(fd, "r");
2645         if (!f)
2646                 return log_oom();
2647         fd = -1;
2648
2649         if (!fgets(line, sizeof(line), f)) {
2650
2651                 if (!ferror(f)) {
2652                         log_error("Failed to resolve user %s.", arg_user);
2653                         return -ESRCH;
2654                 }
2655
2656                 log_error("Failed to read from getent: %m");
2657                 return -errno;
2658         }
2659
2660         truncate_nl(line);
2661
2662         wait_for_terminate_and_warn("getent passwd", pid);
2663
2664         x = strchr(line, ':');
2665         if (!x) {
2666                 log_error("/etc/passwd entry has invalid user field.");
2667                 return -EIO;
2668         }
2669
2670         u = strchr(x+1, ':');
2671         if (!u) {
2672                 log_error("/etc/passwd entry has invalid password field.");
2673                 return -EIO;
2674         }
2675
2676         u++;
2677         g = strchr(u, ':');
2678         if (!g) {
2679                 log_error("/etc/passwd entry has invalid UID field.");
2680                 return -EIO;
2681         }
2682
2683         *g = 0;
2684         g++;
2685         x = strchr(g, ':');
2686         if (!x) {
2687                 log_error("/etc/passwd entry has invalid GID field.");
2688                 return -EIO;
2689         }
2690
2691         *x = 0;
2692         h = strchr(x+1, ':');
2693         if (!h) {
2694                 log_error("/etc/passwd entry has invalid GECOS field.");
2695                 return -EIO;
2696         }
2697
2698         h++;
2699         x = strchr(h, ':');
2700         if (!x) {
2701                 log_error("/etc/passwd entry has invalid home directory field.");
2702                 return -EIO;
2703         }
2704
2705         *x = 0;
2706
2707         r = parse_uid(u, &uid);
2708         if (r < 0) {
2709                 log_error("Failed to parse UID of user.");
2710                 return -EIO;
2711         }
2712
2713         r = parse_gid(g, &gid);
2714         if (r < 0) {
2715                 log_error("Failed to parse GID of user.");
2716                 return -EIO;
2717         }
2718
2719         home = strdup(h);
2720         if (!home)
2721                 return log_oom();
2722
2723         /* Second, get group memberships */
2724         fd = spawn_getent("initgroups", arg_user, &pid);
2725         if (fd < 0)
2726                 return fd;
2727
2728         fclose(f);
2729         f = fdopen(fd, "r");
2730         if (!f)
2731                 return log_oom();
2732         fd = -1;
2733
2734         if (!fgets(line, sizeof(line), f)) {
2735                 if (!ferror(f)) {
2736                         log_error("Failed to resolve user %s.", arg_user);
2737                         return -ESRCH;
2738                 }
2739
2740                 log_error("Failed to read from getent: %m");
2741                 return -errno;
2742         }
2743
2744         truncate_nl(line);
2745
2746         wait_for_terminate_and_warn("getent initgroups", pid);
2747
2748         /* Skip over the username and subsequent separator whitespace */
2749         x = line;
2750         x += strcspn(x, WHITESPACE);
2751         x += strspn(x, WHITESPACE);
2752
2753         FOREACH_WORD(word, l, x, state) {
2754                 char c[l+1];
2755
2756                 memcpy(c, word, l);
2757                 c[l] = 0;
2758
2759                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2760                         return log_oom();
2761
2762                 r = parse_uid(c, &uids[n_uids++]);
2763                 if (r < 0) {
2764                         log_error("Failed to parse group data from getent.");
2765                         return -EIO;
2766                 }
2767         }
2768
2769         r = mkdir_parents(home, 0775);
2770         if (r < 0) {
2771                 log_error("Failed to make home root directory: %s", strerror(-r));
2772                 return r;
2773         }
2774
2775         r = mkdir_safe(home, 0755, uid, gid);
2776         if (r < 0 && r != -EEXIST) {
2777                 log_error("Failed to make home directory: %s", strerror(-r));
2778                 return r;
2779         }
2780
2781         fchown(STDIN_FILENO, uid, gid);
2782         fchown(STDOUT_FILENO, uid, gid);
2783         fchown(STDERR_FILENO, uid, gid);
2784
2785         if (setgroups(n_uids, uids) < 0) {
2786                 log_error("Failed to set auxiliary groups: %m");
2787                 return -errno;
2788         }
2789
2790         if (setresgid(gid, gid, gid) < 0) {
2791                 log_error("setregid() failed: %m");
2792                 return -errno;
2793         }
2794
2795         if (setresuid(uid, uid, uid) < 0) {
2796                 log_error("setreuid() failed: %m");
2797                 return -errno;
2798         }
2799
2800         if (_home) {
2801                 *_home = home;
2802                 home = NULL;
2803         }
2804
2805         return 0;
2806 }
2807
2808 /*
2809  * Return values:
2810  * < 0 : wait_for_terminate() failed to get the state of the
2811  *       container, the container was terminated by a signal, or
2812  *       failed for an unknown reason.  No change is made to the
2813  *       container argument.
2814  * > 0 : The program executed in the container terminated with an
2815  *       error.  The exit code of the program executed in the
2816  *       container is returned.  No change is made to the container
2817  *       argument.
2818  *   0 : The container is being rebooted, has been shut down or exited
2819  *       successfully.  The container argument has been set to either
2820  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2821  *
2822  * That is, success is indicated by a return value of zero, and an
2823  * error is indicated by a non-zero value.
2824  */
2825 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2826         int r;
2827         siginfo_t status;
2828
2829         r = wait_for_terminate(pid, &status);
2830         if (r < 0) {
2831                 log_warning("Failed to wait for container: %s", strerror(-r));
2832                 return r;
2833         }
2834
2835         switch (status.si_code) {
2836         case CLD_EXITED:
2837                 r = status.si_status;
2838                 if (r == 0) {
2839                         if (!arg_quiet)
2840                                 log_debug("Container %s exited successfully.",
2841                                           arg_machine);
2842
2843                         *container = CONTAINER_TERMINATED;
2844                 } else {
2845                         log_error("Container %s failed with error code %i.",
2846                                   arg_machine, status.si_status);
2847                 }
2848                 break;
2849
2850         case CLD_KILLED:
2851                 if (status.si_status == SIGINT) {
2852                         if (!arg_quiet)
2853                                 log_info("Container %s has been shut down.",
2854                                          arg_machine);
2855
2856                         *container = CONTAINER_TERMINATED;
2857                         r = 0;
2858                         break;
2859                 } else if (status.si_status == SIGHUP) {
2860                         if (!arg_quiet)
2861                                 log_info("Container %s is being rebooted.",
2862                                          arg_machine);
2863
2864                         *container = CONTAINER_REBOOTED;
2865                         r = 0;
2866                         break;
2867                 }
2868                 /* CLD_KILLED fallthrough */
2869
2870         case CLD_DUMPED:
2871                 log_error("Container %s terminated by signal %s.",
2872                           arg_machine, signal_to_string(status.si_status));
2873                 r = -1;
2874                 break;
2875
2876         default:
2877                 log_error("Container %s failed due to unknown reason.",
2878                           arg_machine);
2879                 r = -1;
2880                 break;
2881         }
2882
2883         return r;
2884 }
2885
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts the parent's blocking calls. */
static void nop_handler(int sig) {
        /* nothing to do */
}
2887
2888 int main(int argc, char *argv[]) {
2889
2890         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2891         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2892         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2893         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2894         _cleanup_fdset_free_ FDSet *fds = NULL;
2895         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2896         const char *console = NULL;
2897         char veth_name[IFNAMSIZ];
2898         bool secondary = false;
2899         sigset_t mask, mask_chld;
2900         pid_t pid = 0;
2901
2902         log_parse_environment();
2903         log_open();
2904
2905         k = parse_argv(argc, argv);
2906         if (k < 0)
2907                 goto finish;
2908         else if (k == 0) {
2909                 r = EXIT_SUCCESS;
2910                 goto finish;
2911         }
2912
2913         if (!arg_image) {
2914                 if (arg_directory) {
2915                         char *p;
2916
2917                         p = path_make_absolute_cwd(arg_directory);
2918                         free(arg_directory);
2919                         arg_directory = p;
2920                 } else
2921                         arg_directory = get_current_dir_name();
2922
2923                 if (!arg_directory) {
2924                         log_error("Failed to determine path, please use -D.");
2925                         goto finish;
2926                 }
2927                 path_kill_slashes(arg_directory);
2928         }
2929
2930         if (!arg_machine) {
2931                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2932                 if (!arg_machine) {
2933                         log_oom();
2934                         goto finish;
2935                 }
2936
2937                 hostname_cleanup(arg_machine, false);
2938                 if (isempty(arg_machine)) {
2939                         log_error("Failed to determine machine name automatically, please use -M.");
2940                         goto finish;
2941                 }
2942         }
2943
2944         if (geteuid() != 0) {
2945                 log_error("Need to be root.");
2946                 goto finish;
2947         }
2948
2949         if (sd_booted() <= 0) {
2950                 log_error("Not running on a systemd system.");
2951                 goto finish;
2952         }
2953
2954         log_close();
2955         n_fd_passed = sd_listen_fds(false);
2956         if (n_fd_passed > 0) {
2957                 k = fdset_new_listen_fds(&fds, false);
2958                 if (k < 0) {
2959                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2960                         goto finish;
2961                 }
2962         }
2963         fdset_close_others(fds);
2964         log_open();
2965
2966         if (arg_directory) {
2967                 if (path_equal(arg_directory, "/")) {
2968                         log_error("Spawning container on root directory not supported.");
2969                         goto finish;
2970                 }
2971
2972                 if (arg_boot) {
2973                         if (path_is_os_tree(arg_directory) <= 0) {
2974                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2975                                 goto finish;
2976                         }
2977                 } else {
2978                         const char *p;
2979
2980                         p = strappenda(arg_directory,
2981                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2982                         if (access(p, F_OK) < 0) {
2983                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2984                                 goto finish;
2985
2986                         }
2987                 }
2988         } else {
2989                 char template[] = "/tmp/nspawn-root-XXXXXX";
2990
2991                 if (!mkdtemp(template)) {
2992                         log_error("Failed to create temporary directory: %m");
2993                         r = -errno;
2994                         goto finish;
2995                 }
2996
2997                 arg_directory = strdup(template);
2998                 if (!arg_directory) {
2999                         r = log_oom();
3000                         goto finish;
3001                 }
3002
3003                 image_fd = setup_image(&device_path, &loop_nr);
3004                 if (image_fd < 0) {
3005                         r = image_fd;
3006                         goto finish;
3007                 }
3008
3009                 r = dissect_image(image_fd,
3010                                   &root_device, &root_device_rw,
3011                                   &home_device, &home_device_rw,
3012                                   &srv_device, &srv_device_rw,
3013                                   &secondary);
3014                 if (r < 0)
3015                         goto finish;
3016         }
3017
3018         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3019         if (master < 0) {
3020                 log_error("Failed to acquire pseudo tty: %m");
3021                 goto finish;
3022         }
3023
3024         console = ptsname(master);
3025         if (!console) {
3026                 log_error("Failed to determine tty name: %m");
3027                 goto finish;
3028         }
3029
3030         if (!arg_quiet)
3031                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3032                          arg_machine, arg_image ? arg_image : arg_directory);
3033
3034         if (unlockpt(master) < 0) {
3035                 log_error("Failed to unlock tty: %m");
3036                 goto finish;
3037         }
3038
3039         if (access("/dev/kdbus/control", F_OK) >= 0) {
3040
3041                 if (arg_share_system) {
3042                         kdbus_domain = strdup("/dev/kdbus");
3043                         if (!kdbus_domain) {
3044                                 log_oom();
3045                                 goto finish;
3046                         }
3047                 } else {
3048                         const char *ns;
3049
3050                         ns = strappenda("machine-", arg_machine);
3051                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
3052                         if (r < 0)
3053                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
3054                         else
3055                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
3056                 }
3057         }
3058
3059         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3060                 log_error("Failed to create kmsg socket pair: %m");
3061                 goto finish;
3062         }
3063
3064         sd_notify(0, "READY=1");
3065
3066         assert_se(sigemptyset(&mask) == 0);
3067         assert_se(sigemptyset(&mask_chld) == 0);
3068         sigaddset(&mask_chld, SIGCHLD);
3069         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3070         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3071
3072         for (;;) {
3073                 ContainerStatus container_status;
3074                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3075                 struct sigaction sa = {
3076                         .sa_handler = nop_handler,
3077                         .sa_flags = SA_NOCLDSTOP,
3078                 };
3079
3080                 r = barrier_create(&barrier);
3081                 if (r < 0) {
3082                         log_error("Cannot initialize IPC barrier: %s", strerror(-r));
3083                         goto finish;
3084                 }
3085
3086                 /* Child can be killed before execv(), so handle SIGCHLD
3087                  * in order to interrupt parent's blocking calls and
3088                  * give it a chance to call wait() and terminate. */
3089                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3090                 if (r < 0) {
3091                         log_error("Failed to change the signal mask: %m");
3092                         goto finish;
3093                 }
3094
3095                 r = sigaction(SIGCHLD, &sa, NULL);
3096                 if (r < 0) {
3097                         log_error("Failed to install SIGCHLD handler: %m");
3098                         goto finish;
3099                 }
3100
3101                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3102                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3103                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
3104                 if (pid < 0) {
3105                         if (errno == EINVAL)
3106                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3107                         else
3108                                 log_error("clone() failed: %m");
3109
3110                         r = pid;
3111                         goto finish;
3112                 }
3113
3114                 if (pid == 0) {
3115                         /* child */
3116                         _cleanup_free_ char *home = NULL;
3117                         unsigned n_env = 2;
3118                         const char *envp[] = {
3119                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3120                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3121                                 NULL, /* TERM */
3122                                 NULL, /* HOME */
3123                                 NULL, /* USER */
3124                                 NULL, /* LOGNAME */
3125                                 NULL, /* container_uuid */
3126                                 NULL, /* LISTEN_FDS */
3127                                 NULL, /* LISTEN_PID */
3128                                 NULL
3129                         };
3130                         char **env_use;
3131
3132                         barrier_set_role(&barrier, BARRIER_CHILD);
3133
3134                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3135                         if (envp[n_env])
3136                                 n_env ++;
3137
3138                         master = safe_close(master);
3139
3140                         close_nointr(STDIN_FILENO);
3141                         close_nointr(STDOUT_FILENO);
3142                         close_nointr(STDERR_FILENO);
3143
3144                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3145
3146                         reset_all_signal_handlers();
3147
3148                         assert_se(sigemptyset(&mask) == 0);
3149                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
3150
3151                         k = open_terminal(console, O_RDWR);
3152                         if (k != STDIN_FILENO) {
3153                                 if (k >= 0) {
3154                                         safe_close(k);
3155                                         k = -EINVAL;
3156                                 }
3157
3158                                 log_error("Failed to open console: %s", strerror(-k));
3159                                 _exit(EXIT_FAILURE);
3160                         }
3161
3162                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3163                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3164                                 log_error("Failed to duplicate console: %m");
3165                                 _exit(EXIT_FAILURE);
3166                         }
3167
3168                         if (setsid() < 0) {
3169                                 log_error("setsid() failed: %m");
3170                                 _exit(EXIT_FAILURE);
3171                         }
3172
3173                         if (reset_audit_loginuid() < 0)
3174                                 _exit(EXIT_FAILURE);
3175
3176                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3177                                 log_error("PR_SET_PDEATHSIG failed: %m");
3178                                 _exit(EXIT_FAILURE);
3179                         }
3180
3181                         /* Mark everything as slave, so that we still
3182                          * receive mounts from the real root, but don't
3183                          * propagate mounts to the real root. */
3184                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3185                                 log_error("MS_SLAVE|MS_REC failed: %m");
3186                                 _exit(EXIT_FAILURE);
3187                         }
3188
3189                         if (mount_devices(arg_directory,
3190                                           root_device, root_device_rw,
3191                                           home_device, home_device_rw,
3192                                           srv_device, srv_device_rw) < 0)
3193                                 _exit(EXIT_FAILURE);
3194
3195                         /* Turn directory into bind mount */
3196                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3197                                 log_error("Failed to make bind mount: %m");
3198                                 _exit(EXIT_FAILURE);
3199                         }
3200
3201                         r = setup_volatile(arg_directory);
3202                         if (r < 0)
3203                                 _exit(EXIT_FAILURE);
3204
3205                         if (setup_volatile_state(arg_directory) < 0)
3206                                 _exit(EXIT_FAILURE);
3207
3208                         r = base_filesystem_create(arg_directory);
3209                         if (r < 0)
3210                                 _exit(EXIT_FAILURE);
3211
3212                         if (arg_read_only) {
3213                                 k = bind_remount_recursive(arg_directory, true);
3214                                 if (k < 0) {
3215                                         log_error("Failed to make tree read-only: %s", strerror(-k));
3216                                         _exit(EXIT_FAILURE);
3217                                 }
3218                         }
3219
3220                         if (mount_all(arg_directory) < 0)
3221                                 _exit(EXIT_FAILURE);
3222
3223                         if (copy_devnodes(arg_directory) < 0)
3224                                 _exit(EXIT_FAILURE);
3225
3226                         if (setup_ptmx(arg_directory) < 0)
3227                                 _exit(EXIT_FAILURE);
3228
3229                         dev_setup(arg_directory);
3230
3231                         if (setup_seccomp() < 0)
3232                                 _exit(EXIT_FAILURE);
3233
3234                         if (setup_dev_console(arg_directory, console) < 0)
3235                                 _exit(EXIT_FAILURE);
3236
3237                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3238                                 _exit(EXIT_FAILURE);
3239
3240                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3241
3242                         if (setup_boot_id(arg_directory) < 0)
3243                                 _exit(EXIT_FAILURE);
3244
3245                         if (setup_timezone(arg_directory) < 0)
3246                                 _exit(EXIT_FAILURE);
3247
3248                         if (setup_resolv_conf(arg_directory) < 0)
3249                                 _exit(EXIT_FAILURE);
3250
3251                         if (setup_journal(arg_directory) < 0)
3252                                 _exit(EXIT_FAILURE);
3253
3254                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3255                                 _exit(EXIT_FAILURE);
3256
3257                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3258                                 _exit(EXIT_FAILURE);
3259
3260                         if (mount_tmpfs(arg_directory) < 0)
3261                                 _exit(EXIT_FAILURE);
3262
3263                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3264                                 _exit(EXIT_FAILURE);
3265
3266                         /* Tell the parent that we are ready, and that
3267                          * it can cgroupify us to that we lack access
3268                          * to certain devices and resources. */
3269                         barrier_place(&barrier);
3270
3271                         if (chdir(arg_directory) < 0) {
3272                                 log_error("chdir(%s) failed: %m", arg_directory);
3273                                 _exit(EXIT_FAILURE);
3274                         }
3275
3276                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3277                                 log_error("mount(MS_MOVE) failed: %m");
3278                                 _exit(EXIT_FAILURE);
3279                         }
3280
3281                         if (chroot(".") < 0) {
3282                                 log_error("chroot() failed: %m");
3283                                 _exit(EXIT_FAILURE);
3284                         }
3285
3286                         if (chdir("/") < 0) {
3287                                 log_error("chdir() failed: %m");
3288                                 _exit(EXIT_FAILURE);
3289                         }
3290
3291                         umask(0022);
3292
3293                         if (arg_private_network)
3294                                 loopback_setup();
3295
3296                         if (drop_capabilities() < 0) {
3297                                 log_error("drop_capabilities() failed: %m");
3298                                 _exit(EXIT_FAILURE);
3299                         }
3300
3301                         r = change_uid_gid(&home);
3302                         if (r < 0)
3303                                 _exit(EXIT_FAILURE);
3304
3305                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3306                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3307                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3308                                 log_oom();
3309                                 _exit(EXIT_FAILURE);
3310                         }
3311
3312                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3313                                 char as_uuid[37];
3314
3315                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3316                                         log_oom();
3317                                         _exit(EXIT_FAILURE);
3318                                 }
3319                         }
3320
3321                         if (fdset_size(fds) > 0) {
3322                                 k = fdset_cloexec(fds, false);
3323                                 if (k < 0) {
3324                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3325                                         _exit(EXIT_FAILURE);
3326                                 }
3327
3328                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3329                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3330                                         log_oom();
3331                                         _exit(EXIT_FAILURE);
3332                                 }
3333                         }
3334
3335                         setup_hostname();
3336
3337                         if (arg_personality != 0xffffffffLU) {
3338                                 if (personality(arg_personality) < 0) {
3339                                         log_error("personality() failed: %m");
3340                                         _exit(EXIT_FAILURE);
3341                                 }
3342                         } else if (secondary) {
3343                                 if (personality(PER_LINUX32) < 0) {
3344                                         log_error("personality() failed: %m");
3345                                         _exit(EXIT_FAILURE);
3346                                 }
3347                         }
3348
3349 #ifdef HAVE_SELINUX
3350                         if (arg_selinux_context)
3351                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3352                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3353                                         _exit(EXIT_FAILURE);
3354                                 }
3355 #endif
3356
3357                         if (!strv_isempty(arg_setenv)) {
3358                                 char **n;
3359
3360                                 n = strv_env_merge(2, envp, arg_setenv);
3361                                 if (!n) {
3362                                         log_oom();
3363                                         _exit(EXIT_FAILURE);
3364                                 }
3365
3366                                 env_use = n;
3367                         } else
3368                                 env_use = (char**) envp;
3369
3370                         /* Wait until the parent is ready with the setup, too... */
3371                         if (!barrier_place_and_sync(&barrier))
3372                                 _exit(EXIT_FAILURE);
3373
3374                         if (arg_boot) {
3375                                 char **a;
3376                                 size_t l;
3377
3378                                 /* Automatically search for the init system */
3379
3380                                 l = 1 + argc - optind;
3381                                 a = newa(char*, l + 1);
3382                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3383
3384                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3385                                 execve(a[0], a, env_use);
3386
3387                                 a[0] = (char*) "/lib/systemd/systemd";
3388                                 execve(a[0], a, env_use);
3389
3390                                 a[0] = (char*) "/sbin/init";
3391                                 execve(a[0], a, env_use);
3392                         } else if (argc > optind)
3393                                 execvpe(argv[optind], argv + optind, env_use);
3394                         else {
3395                                 chdir(home ? home : "/root");
3396                                 execle("/bin/bash", "-bash", NULL, env_use);
3397                                 execle("/bin/sh", "-sh", NULL, env_use);
3398                         }
3399
3400                         log_error("execv() failed: %m");
3401                         _exit(EXIT_FAILURE);
3402                 }
3403
3404                 barrier_set_role(&barrier, BARRIER_PARENT);
3405                 fdset_free(fds);
3406                 fds = NULL;
3407
3408                 /* wait for child-setup to be done */
3409                 if (barrier_place_and_sync(&barrier)) {
3410                         int ifi = 0;
3411
3412                         r = move_network_interfaces(pid);
3413                         if (r < 0)
3414                                 goto finish;
3415
3416                         r = setup_veth(pid, veth_name, &ifi);
3417                         if (r < 0)
3418                                 goto finish;
3419
3420                         r = setup_bridge(veth_name, &ifi);
3421                         if (r < 0)
3422                                 goto finish;
3423
3424                         r = setup_macvlan(pid);
3425                         if (r < 0)
3426                                 goto finish;
3427
3428                         r = register_machine(pid, ifi);
3429                         if (r < 0)
3430                                 goto finish;
3431
3432                         /* Block SIGCHLD here, before notifying child.
3433                          * process_pty() will handle it with the other signals. */
3434                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3435                         if (r < 0)
3436                                 goto finish;
3437
3438                         /* Reset signal to default */
3439                         r = default_signals(SIGCHLD, -1);
3440                         if (r < 0)
3441                                 goto finish;
3442
3443                         /* Notify the child that the parent is ready with all
3444                          * its setup, and that the child can now hand over
3445                          * control to the code to run inside the container. */
3446                         barrier_place(&barrier);
3447
3448                         k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3449                         if (k < 0) {
3450                                 r = EXIT_FAILURE;
3451                                 break;
3452                         }
3453
3454                         if (!arg_quiet)
3455                                 putc('\n', stdout);
3456
3457                         /* Kill if it is not dead yet anyway */
3458                         terminate_machine(pid);
3459                 }
3460
3461                 /* Normally redundant, but better safe than sorry */
3462                 kill(pid, SIGKILL);
3463
3464                 r = wait_for_container(pid, &container_status);
3465                 pid = 0;
3466
3467                 if (r < 0) {
3468                         /* We failed to wait for the container, or the
3469                          * container exited abnormally */
3470                         r = EXIT_FAILURE;
3471                         break;
3472                 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3473                         /* The container exited with a non-zero
3474                          * status, or with zero status and no reboot
3475                          * was requested. */
3476                         break;
3477
3478                 /* CONTAINER_REBOOTED, loop again */
3479
3480                 if (arg_keep_unit) {
3481                         /* Special handling if we are running as a
3482                          * service: instead of simply restarting the
3483                          * machine we want to restart the entire
3484                          * service, so let's inform systemd about this
3485                          * with the special exit code 133. The service
3486                          * file uses RestartForceExitStatus=133 so
3487                          * that this results in a full nspawn
3488                          * restart. This is necessary since we might
3489                          * have cgroup parameters set we want to have
3490                          * flushed out. */
3491                         r = 133;
3492                         break;
3493                 }
3494         }
3495
3496 finish:
3497         loop_remove(loop_nr, &image_fd);
3498
3499         if (pid > 0)
3500                 kill(pid, SIGKILL);
3501
3502         free(arg_directory);
3503         free(arg_machine);
3504         free(arg_user);
3505         strv_free(arg_setenv);
3506         strv_free(arg_network_interfaces);
3507         strv_free(arg_network_macvlan);
3508         strv_free(arg_bind);
3509         strv_free(arg_bind_ro);
3510         strv_free(arg_tmpfs);
3511
3512         return r;
3513 }