chiark / gitweb /
76e86b7e05ea1593c2f9be3374840df574580adb
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92
93 #ifdef HAVE_SECCOMP
94 #include "seccomp-util.h"
95 #endif
96
/* Outcome codes for a container run; the names suggest "terminated"
 * versus "rebooted" (consumers are outside this part of the file). */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;
101
/* Modes accepted by --link-journal= (see parse_argv()): "no", "auto",
 * "host" and "guest" map onto these values one-to-one. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
108
/* Modes accepted by --volatile[=MODE] (see parse_argv()): a boolean
 * selects VOLATILE_YES/VOLATILE_NO, the literal "state" selects
 * VOLATILE_STATE. */
typedef enum Volatile {
        VOLATILE_NO    = 0,
        VOLATILE_YES   = 1,
        VOLATILE_STATE = 2
} Volatile;
114
115 static char *arg_directory = NULL;
116 static char *arg_user = NULL;
117 static sd_id128_t arg_uuid = {};
118 static char *arg_machine = NULL;
119 static const char *arg_selinux_context = NULL;
120 static const char *arg_selinux_apifs_context = NULL;
121 static const char *arg_slice = NULL;
122 static bool arg_private_network = false;
123 static bool arg_read_only = false;
124 static bool arg_boot = false;
125 static LinkJournal arg_link_journal = LINK_AUTO;
126 static uint64_t arg_retain =
127         (1ULL << CAP_CHOWN) |
128         (1ULL << CAP_DAC_OVERRIDE) |
129         (1ULL << CAP_DAC_READ_SEARCH) |
130         (1ULL << CAP_FOWNER) |
131         (1ULL << CAP_FSETID) |
132         (1ULL << CAP_IPC_OWNER) |
133         (1ULL << CAP_KILL) |
134         (1ULL << CAP_LEASE) |
135         (1ULL << CAP_LINUX_IMMUTABLE) |
136         (1ULL << CAP_NET_BIND_SERVICE) |
137         (1ULL << CAP_NET_BROADCAST) |
138         (1ULL << CAP_NET_RAW) |
139         (1ULL << CAP_SETGID) |
140         (1ULL << CAP_SETFCAP) |
141         (1ULL << CAP_SETPCAP) |
142         (1ULL << CAP_SETUID) |
143         (1ULL << CAP_SYS_ADMIN) |
144         (1ULL << CAP_SYS_CHROOT) |
145         (1ULL << CAP_SYS_NICE) |
146         (1ULL << CAP_SYS_PTRACE) |
147         (1ULL << CAP_SYS_TTY_CONFIG) |
148         (1ULL << CAP_SYS_RESOURCE) |
149         (1ULL << CAP_SYS_BOOT) |
150         (1ULL << CAP_AUDIT_WRITE) |
151         (1ULL << CAP_AUDIT_CONTROL) |
152         (1ULL << CAP_MKNOD);
153 static char **arg_bind = NULL;
154 static char **arg_bind_ro = NULL;
155 static char **arg_tmpfs = NULL;
156 static char **arg_setenv = NULL;
157 static bool arg_quiet = false;
158 static bool arg_share_system = false;
159 static bool arg_register = true;
160 static bool arg_keep_unit = false;
161 static char **arg_network_interfaces = NULL;
162 static char **arg_network_macvlan = NULL;
163 static bool arg_network_veth = false;
164 static const char *arg_network_bridge = NULL;
165 static unsigned long arg_personality = 0xffffffffLU;
166 static const char *arg_image = NULL;
167 static Volatile arg_volatile = VOLATILE_NO;
168
/* Print the command line synopsis and the list of supported options to
 * stdout. The text is kept in sync with the option table and the
 * switch in parse_argv(): -j selects the "guest" journal link mode
 * there, and --personality= is a recognised option. */
static void help(void) {
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=PATH       Root directory for the container\n"
               "  -i --image=PATH           File system device or image for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n"
               "     --personality=ARCH     Pick personality for this container\n"
               "     --volatile[=MODE]      Run the system in volatile mode\n",
               program_invocation_short_name);
}
219
220 static int parse_argv(int argc, char *argv[]) {
221
222         enum {
223                 ARG_VERSION = 0x100,
224                 ARG_PRIVATE_NETWORK,
225                 ARG_UUID,
226                 ARG_READ_ONLY,
227                 ARG_CAPABILITY,
228                 ARG_DROP_CAPABILITY,
229                 ARG_LINK_JOURNAL,
230                 ARG_BIND,
231                 ARG_BIND_RO,
232                 ARG_TMPFS,
233                 ARG_SETENV,
234                 ARG_SHARE_SYSTEM,
235                 ARG_REGISTER,
236                 ARG_KEEP_UNIT,
237                 ARG_NETWORK_INTERFACE,
238                 ARG_NETWORK_MACVLAN,
239                 ARG_NETWORK_VETH,
240                 ARG_NETWORK_BRIDGE,
241                 ARG_PERSONALITY,
242                 ARG_VOLATILE,
243         };
244
245         static const struct option options[] = {
246                 { "help",                  no_argument,       NULL, 'h'                   },
247                 { "version",               no_argument,       NULL, ARG_VERSION           },
248                 { "directory",             required_argument, NULL, 'D'                   },
249                 { "user",                  required_argument, NULL, 'u'                   },
250                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
251                 { "boot",                  no_argument,       NULL, 'b'                   },
252                 { "uuid",                  required_argument, NULL, ARG_UUID              },
253                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
254                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
255                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
256                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
257                 { "bind",                  required_argument, NULL, ARG_BIND              },
258                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
259                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
260                 { "machine",               required_argument, NULL, 'M'                   },
261                 { "slice",                 required_argument, NULL, 'S'                   },
262                 { "setenv",                required_argument, NULL, ARG_SETENV            },
263                 { "selinux-context",       required_argument, NULL, 'Z'                   },
264                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
265                 { "quiet",                 no_argument,       NULL, 'q'                   },
266                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
267                 { "register",              required_argument, NULL, ARG_REGISTER          },
268                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
269                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
270                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
271                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
272                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
273                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
274                 { "image",                 required_argument, NULL, 'i'                   },
275                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
276                 {}
277         };
278
279         int c, r;
280         uint64_t plus = 0, minus = 0;
281
282         assert(argc >= 0);
283         assert(argv);
284
285         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
286
287                 switch (c) {
288
289                 case 'h':
290                         help();
291                         return 0;
292
293                 case ARG_VERSION:
294                         puts(PACKAGE_STRING);
295                         puts(SYSTEMD_FEATURES);
296                         return 0;
297
298                 case 'D':
299                         free(arg_directory);
300                         arg_directory = canonicalize_file_name(optarg);
301                         if (!arg_directory) {
302                                 log_error("Invalid root directory: %m");
303                                 return -ENOMEM;
304                         }
305
306                         break;
307
308                 case 'i':
309                         arg_image = optarg;
310                         break;
311
312                 case 'u':
313                         free(arg_user);
314                         arg_user = strdup(optarg);
315                         if (!arg_user)
316                                 return log_oom();
317
318                         break;
319
320                 case ARG_NETWORK_BRIDGE:
321                         arg_network_bridge = optarg;
322
323                         /* fall through */
324
325                 case ARG_NETWORK_VETH:
326                         arg_network_veth = true;
327                         arg_private_network = true;
328                         break;
329
330                 case ARG_NETWORK_INTERFACE:
331                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
332                                 return log_oom();
333
334                         arg_private_network = true;
335                         break;
336
337                 case ARG_NETWORK_MACVLAN:
338                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
339                                 return log_oom();
340
341                         /* fall through */
342
343                 case ARG_PRIVATE_NETWORK:
344                         arg_private_network = true;
345                         break;
346
347                 case 'b':
348                         arg_boot = true;
349                         break;
350
351                 case ARG_UUID:
352                         r = sd_id128_from_string(optarg, &arg_uuid);
353                         if (r < 0) {
354                                 log_error("Invalid UUID: %s", optarg);
355                                 return r;
356                         }
357                         break;
358
359                 case 'S':
360                         arg_slice = optarg;
361                         break;
362
363                 case 'M':
364                         if (isempty(optarg)) {
365                                 free(arg_machine);
366                                 arg_machine = NULL;
367                         } else {
368
369                                 if (!hostname_is_valid(optarg)) {
370                                         log_error("Invalid machine name: %s", optarg);
371                                         return -EINVAL;
372                                 }
373
374                                 free(arg_machine);
375                                 arg_machine = strdup(optarg);
376                                 if (!arg_machine)
377                                         return log_oom();
378
379                                 break;
380                         }
381
382                 case 'Z':
383                         arg_selinux_context = optarg;
384                         break;
385
386                 case 'L':
387                         arg_selinux_apifs_context = optarg;
388                         break;
389
390                 case ARG_READ_ONLY:
391                         arg_read_only = true;
392                         break;
393
394                 case ARG_CAPABILITY:
395                 case ARG_DROP_CAPABILITY: {
396                         const char *state, *word;
397                         size_t length;
398
399                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
400                                 _cleanup_free_ char *t;
401                                 cap_value_t cap;
402
403                                 t = strndup(word, length);
404                                 if (!t)
405                                         return log_oom();
406
407                                 if (streq(t, "all")) {
408                                         if (c == ARG_CAPABILITY)
409                                                 plus = (uint64_t) -1;
410                                         else
411                                                 minus = (uint64_t) -1;
412                                 } else {
413                                         if (cap_from_name(t, &cap) < 0) {
414                                                 log_error("Failed to parse capability %s.", t);
415                                                 return -EINVAL;
416                                         }
417
418                                         if (c == ARG_CAPABILITY)
419                                                 plus |= 1ULL << (uint64_t) cap;
420                                         else
421                                                 minus |= 1ULL << (uint64_t) cap;
422                                 }
423                         }
424
425                         break;
426                 }
427
428                 case 'j':
429                         arg_link_journal = LINK_GUEST;
430                         break;
431
432                 case ARG_LINK_JOURNAL:
433                         if (streq(optarg, "auto"))
434                                 arg_link_journal = LINK_AUTO;
435                         else if (streq(optarg, "no"))
436                                 arg_link_journal = LINK_NO;
437                         else if (streq(optarg, "guest"))
438                                 arg_link_journal = LINK_GUEST;
439                         else if (streq(optarg, "host"))
440                                 arg_link_journal = LINK_HOST;
441                         else {
442                                 log_error("Failed to parse link journal mode %s", optarg);
443                                 return -EINVAL;
444                         }
445
446                         break;
447
448                 case ARG_BIND:
449                 case ARG_BIND_RO: {
450                         _cleanup_free_ char *a = NULL, *b = NULL;
451                         char *e;
452                         char ***x;
453
454                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
455
456                         e = strchr(optarg, ':');
457                         if (e) {
458                                 a = strndup(optarg, e - optarg);
459                                 b = strdup(e + 1);
460                         } else {
461                                 a = strdup(optarg);
462                                 b = strdup(optarg);
463                         }
464
465                         if (!a || !b)
466                                 return log_oom();
467
468                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
469                                 log_error("Invalid bind mount specification: %s", optarg);
470                                 return -EINVAL;
471                         }
472
473                         r = strv_extend(x, a);
474                         if (r < 0)
475                                 return log_oom();
476
477                         r = strv_extend(x, b);
478                         if (r < 0)
479                                 return log_oom();
480
481                         break;
482                 }
483
484                 case ARG_TMPFS: {
485                         _cleanup_free_ char *a = NULL, *b = NULL;
486                         char *e;
487
488                         e = strchr(optarg, ':');
489                         if (e) {
490                                 a = strndup(optarg, e - optarg);
491                                 b = strdup(e + 1);
492                         } else {
493                                 a = strdup(optarg);
494                                 b = strdup("mode=0755");
495                         }
496
497                         if (!a || !b)
498                                 return log_oom();
499
500                         if (!path_is_absolute(a)) {
501                                 log_error("Invalid tmpfs specification: %s", optarg);
502                                 return -EINVAL;
503                         }
504
505                         r = strv_push(&arg_tmpfs, a);
506                         if (r < 0)
507                                 return log_oom();
508
509                         a = NULL;
510
511                         r = strv_push(&arg_tmpfs, b);
512                         if (r < 0)
513                                 return log_oom();
514
515                         b = NULL;
516
517                         break;
518                 }
519
520                 case ARG_SETENV: {
521                         char **n;
522
523                         if (!env_assignment_is_valid(optarg)) {
524                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
525                                 return -EINVAL;
526                         }
527
528                         n = strv_env_set(arg_setenv, optarg);
529                         if (!n)
530                                 return log_oom();
531
532                         strv_free(arg_setenv);
533                         arg_setenv = n;
534                         break;
535                 }
536
537                 case 'q':
538                         arg_quiet = true;
539                         break;
540
541                 case ARG_SHARE_SYSTEM:
542                         arg_share_system = true;
543                         break;
544
545                 case ARG_REGISTER:
546                         r = parse_boolean(optarg);
547                         if (r < 0) {
548                                 log_error("Failed to parse --register= argument: %s", optarg);
549                                 return r;
550                         }
551
552                         arg_register = r;
553                         break;
554
555                 case ARG_KEEP_UNIT:
556                         arg_keep_unit = true;
557                         break;
558
559                 case ARG_PERSONALITY:
560
561                         arg_personality = personality_from_string(optarg);
562                         if (arg_personality == 0xffffffffLU) {
563                                 log_error("Unknown or unsupported personality '%s'.", optarg);
564                                 return -EINVAL;
565                         }
566
567                         break;
568
569                 case ARG_VOLATILE:
570
571                         if (!optarg)
572                                 arg_volatile = VOLATILE_YES;
573                         else {
574                                 r = parse_boolean(optarg);
575                                 if (r < 0) {
576                                         if (streq(optarg, "state"))
577                                                 arg_volatile = VOLATILE_STATE;
578                                         else {
579                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
580                                                 return r;
581                                         }
582                                 } else
583                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
584                         }
585
586                         break;
587
588                 case '?':
589                         return -EINVAL;
590
591                 default:
592                         assert_not_reached("Unhandled option");
593                 }
594
595         if (arg_share_system)
596                 arg_register = false;
597
598         if (arg_boot && arg_share_system) {
599                 log_error("--boot and --share-system may not be combined.");
600                 return -EINVAL;
601         }
602
603         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
604                 log_error("--keep-unit may not be used when invoked from a user session.");
605                 return -EINVAL;
606         }
607
608         if (arg_directory && arg_image) {
609                 log_error("--directory= and --image= may not be combined.");
610                 return -EINVAL;
611         }
612
613         if (arg_volatile != VOLATILE_NO && arg_read_only) {
614                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
615                 return -EINVAL;
616         }
617
618         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
619
620         return 1;
621 }
622
623 static int mount_all(const char *dest) {
624
625         typedef struct MountPoint {
626                 const char *what;
627                 const char *where;
628                 const char *type;
629                 const char *options;
630                 unsigned long flags;
631                 bool fatal;
632         } MountPoint;
633
634         static const MountPoint mount_table[] = {
635                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
636                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
637                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
638                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
639                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
640                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
641                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
642                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
643 #ifdef HAVE_SELINUX
644                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
645                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
646 #endif
647         };
648
649         unsigned k;
650         int r = 0;
651
652         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
653                 _cleanup_free_ char *where = NULL;
654 #ifdef HAVE_SELINUX
655                 _cleanup_free_ char *options = NULL;
656 #endif
657                 const char *o;
658                 int t;
659
660                 where = strjoin(dest, "/", mount_table[k].where, NULL);
661                 if (!where)
662                         return log_oom();
663
664                 t = path_is_mount_point(where, true);
665                 if (t < 0) {
666                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
667
668                         if (r == 0)
669                                 r = t;
670
671                         continue;
672                 }
673
674                 /* Skip this entry if it is not a remount. */
675                 if (mount_table[k].what && t > 0)
676                         continue;
677
678                 mkdir_p(where, 0755);
679
680 #ifdef HAVE_SELINUX
681                 if (arg_selinux_apifs_context &&
682                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
683                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
684                         if (!options)
685                                 return log_oom();
686
687                         o = options;
688                 } else
689 #endif
690                         o = mount_table[k].options;
691
692
693                 if (mount(mount_table[k].what,
694                           where,
695                           mount_table[k].type,
696                           mount_table[k].flags,
697                           o) < 0 &&
698                     mount_table[k].fatal) {
699
700                         log_error("mount(%s) failed: %m", where);
701
702                         if (r == 0)
703                                 r = -errno;
704                 }
705         }
706
707         return r;
708 }
709
710 static int mount_binds(const char *dest, char **l, bool ro) {
711         char **x, **y;
712
713         STRV_FOREACH_PAIR(x, y, l) {
714                 _cleanup_free_ char *where = NULL;
715                 struct stat source_st, dest_st;
716                 int r;
717
718                 if (stat(*x, &source_st) < 0) {
719                         log_error("Failed to stat %s: %m", *x);
720                         return -errno;
721                 }
722
723                 where = strappend(dest, *y);
724                 if (!where)
725                         return log_oom();
726
727                 r = stat(where, &dest_st);
728                 if (r == 0) {
729                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
730                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
731                                 return -EINVAL;
732                         }
733                 } else if (errno == ENOENT) {
734                         r = mkdir_parents_label(where, 0755);
735                         if (r < 0) {
736                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
737                                 return r;
738                         }
739                 } else {
740                         log_error("Failed to bind mount %s: %m", *x);
741                         return -errno;
742                 }
743
744                 /* Create the mount point, but be conservative -- refuse to create block
745                  * and char devices. */
746                 if (S_ISDIR(source_st.st_mode))
747                         mkdir_label(where, 0755);
748                 else if (S_ISFIFO(source_st.st_mode))
749                         mkfifo(where, 0644);
750                 else if (S_ISSOCK(source_st.st_mode))
751                         mknod(where, 0644 | S_IFSOCK, 0);
752                 else if (S_ISREG(source_st.st_mode))
753                         touch(where);
754                 else {
755                         log_error("Refusing to create mountpoint for file: %s", *x);
756                         return -ENOTSUP;
757                 }
758
759                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
760                         log_error("mount(%s) failed: %m", where);
761                         return -errno;
762                 }
763
764                 if (ro) {
765                         r = bind_remount_recursive(where, true);
766                         if (r < 0) {
767                                 log_error("Read-Only bind mount failed: %s", strerror(-r));
768                                 return r;
769                         }
770                 }
771         }
772
773         return 0;
774 }
775
776 static int mount_tmpfs(const char *dest) {
777         char **i, **o;
778
779         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
780                 _cleanup_free_ char *where = NULL;
781
782                 where = strappend(dest, *i);
783                 if (!where)
784                         return log_oom();
785
786                 mkdir_label(where, 0755);
787
788                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
789                         log_error("tmpfs mount to %s failed: %m", where);
790                         return -errno;
791                 }
792         }
793
794         return 0;
795 }
796
797 static int setup_timezone(const char *dest) {
798         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
799         char *z, *y;
800         int r;
801
802         assert(dest);
803
804         /* Fix the timezone, if possible */
805         r = readlink_malloc("/etc/localtime", &p);
806         if (r < 0) {
807                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
808                 return 0;
809         }
810
811         z = path_startswith(p, "../usr/share/zoneinfo/");
812         if (!z)
813                 z = path_startswith(p, "/usr/share/zoneinfo/");
814         if (!z) {
815                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
816                 return 0;
817         }
818
819         where = strappend(dest, "/etc/localtime");
820         if (!where)
821                 return log_oom();
822
823         r = readlink_malloc(where, &q);
824         if (r >= 0) {
825                 y = path_startswith(q, "../usr/share/zoneinfo/");
826                 if (!y)
827                         y = path_startswith(q, "/usr/share/zoneinfo/");
828
829                 /* Already pointing to the right place? Then do nothing .. */
830                 if (y && streq(y, z))
831                         return 0;
832         }
833
834         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
835         if (!check)
836                 return log_oom();
837
838         if (access(check, F_OK) < 0) {
839                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
840                 return 0;
841         }
842
843         what = strappend("../usr/share/zoneinfo/", z);
844         if (!what)
845                 return log_oom();
846
847         mkdir_parents(where, 0755);
848         unlink(where);
849
850         if (symlink(what, where) < 0) {
851                 log_error("Failed to correct timezone of container: %m");
852                 return 0;
853         }
854
855         return 0;
856 }
857
858 static int setup_resolv_conf(const char *dest) {
859         _cleanup_free_ char *where = NULL;
860
861         assert(dest);
862
863         if (arg_private_network)
864                 return 0;
865
866         /* Fix resolv.conf, if possible */
867         where = strappend(dest, "/etc/resolv.conf");
868         if (!where)
869                 return log_oom();
870
871         /* We don't really care for the results of this really. If it
872          * fails, it fails, but meh... */
873         mkdir_parents(where, 0755);
874         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
875
876         return 0;
877 }
878
879 static int setup_volatile_state(const char *directory) {
880         const char *p;
881         int r;
882
883         assert(directory);
884
885         if (arg_volatile != VOLATILE_STATE)
886                 return 0;
887
888         /* --volatile=state means we simply overmount /var
889            with a tmpfs, and the rest read-only. */
890
891         r = bind_remount_recursive(directory, true);
892         if (r < 0) {
893                 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
894                 return r;
895         }
896
897         p = strappenda(directory, "/var");
898         mkdir(p, 0755);
899
900         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
901                 log_error("Failed to mount tmpfs to /var: %m");
902                 return -errno;
903         }
904
905         return 0;
906 }
907
908 static int setup_volatile(const char *directory) {
909         bool tmpfs_mounted = false, bind_mounted = false;
910         char template[] = "/tmp/nspawn-volatile-XXXXXX";
911         const char *f, *t;
912         int r;
913
914         assert(directory);
915
916         if (arg_volatile != VOLATILE_YES)
917                 return 0;
918
919         /* --volatile=yes means we mount a tmpfs to the root dir, and
920            the original /usr to use inside it, and that read-only. */
921
922         if (!mkdtemp(template)) {
923                 log_error("Failed to create temporary directory: %m");
924                 return -errno;
925         }
926
927         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
928                 log_error("Failed to mount tmpfs for root directory: %m");
929                 r = -errno;
930                 goto fail;
931         }
932
933         tmpfs_mounted = true;
934
935         f = strappenda(directory, "/usr");
936         t = strappenda(template, "/usr");
937
938         mkdir(t, 0755);
939         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
940                 log_error("Failed to create /usr bind mount: %m");
941                 r = -errno;
942                 goto fail;
943         }
944
945         bind_mounted = true;
946
947         r = bind_remount_recursive(t, true);
948         if (r < 0) {
949                 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
950                 goto fail;
951         }
952
953         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
954                 log_error("Failed to move root mount: %m");
955                 r = -errno;
956                 goto fail;
957         }
958
959         rmdir(template);
960
961         return 0;
962
963 fail:
964         if (bind_mounted)
965                 umount(t);
966         if (tmpfs_mounted)
967                 umount(template);
968         rmdir(template);
969         return r;
970 }
971
972 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
973
974         snprintf(s, 37,
975                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
976                  SD_ID128_FORMAT_VAL(id));
977
978         return s;
979 }
980
981 static int setup_boot_id(const char *dest) {
982         _cleanup_free_ char *from = NULL, *to = NULL;
983         sd_id128_t rnd = {};
984         char as_uuid[37];
985         int r;
986
987         assert(dest);
988
989         if (arg_share_system)
990                 return 0;
991
992         /* Generate a new randomized boot ID, so that each boot-up of
993          * the container gets a new one */
994
995         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
996         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
997         if (!from || !to)
998                 return log_oom();
999
1000         r = sd_id128_randomize(&rnd);
1001         if (r < 0) {
1002                 log_error("Failed to generate random boot id: %s", strerror(-r));
1003                 return r;
1004         }
1005
1006         id128_format_as_uuid(rnd, as_uuid);
1007
1008         r = write_string_file(from, as_uuid);
1009         if (r < 0) {
1010                 log_error("Failed to write boot id: %s", strerror(-r));
1011                 return r;
1012         }
1013
1014         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1015                 log_error("Failed to bind mount boot id: %m");
1016                 r = -errno;
1017         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1018                 log_warning("Failed to make boot id read-only: %m");
1019
1020         unlink(from);
1021         return r;
1022 }
1023
1024 static int copy_devnodes(const char *dest) {
1025
1026         static const char devnodes[] =
1027                 "null\0"
1028                 "zero\0"
1029                 "full\0"
1030                 "random\0"
1031                 "urandom\0"
1032                 "tty\0";
1033
1034         const char *d;
1035         int r = 0;
1036         _cleanup_umask_ mode_t u;
1037
1038         assert(dest);
1039
1040         u = umask(0000);
1041
1042         NULSTR_FOREACH(d, devnodes) {
1043                 _cleanup_free_ char *from = NULL, *to = NULL;
1044                 struct stat st;
1045
1046                 from = strappend("/dev/", d);
1047                 to = strjoin(dest, "/dev/", d, NULL);
1048                 if (!from || !to)
1049                         return log_oom();
1050
1051                 if (stat(from, &st) < 0) {
1052
1053                         if (errno != ENOENT) {
1054                                 log_error("Failed to stat %s: %m", from);
1055                                 return -errno;
1056                         }
1057
1058                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1059
1060                         log_error("%s is not a char or block device, cannot copy", from);
1061                         return -EIO;
1062
1063                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1064
1065                         log_error("mknod(%s) failed: %m", dest);
1066                         return  -errno;
1067                 }
1068         }
1069
1070         return r;
1071 }
1072
1073 static int setup_ptmx(const char *dest) {
1074         _cleanup_free_ char *p = NULL;
1075
1076         p = strappend(dest, "/dev/ptmx");
1077         if (!p)
1078                 return log_oom();
1079
1080         if (symlink("pts/ptmx", p) < 0) {
1081                 log_error("Failed to create /dev/ptmx symlink: %m");
1082                 return -errno;
1083         }
1084
1085         return 0;
1086 }
1087
1088 static int setup_dev_console(const char *dest, const char *console) {
1089         _cleanup_umask_ mode_t u;
1090         const char *to;
1091         struct stat st;
1092         int r;
1093
1094         assert(dest);
1095         assert(console);
1096
1097         u = umask(0000);
1098
1099         if (stat("/dev/null", &st) < 0) {
1100                 log_error("Failed to stat /dev/null: %m");
1101                 return -errno;
1102         }
1103
1104         r = chmod_and_chown(console, 0600, 0, 0);
1105         if (r < 0) {
1106                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
1107                 return r;
1108         }
1109
1110         /* We need to bind mount the right tty to /dev/console since
1111          * ptys can only exist on pts file systems. To have something
1112          * to bind mount things on we create a device node first, and
1113          * use /dev/null for that since we the cgroups device policy
1114          * allows us to create that freely, while we cannot create
1115          * /dev/console. (Note that the major minor doesn't actually
1116          * matter here, since we mount it over anyway). */
1117
1118         to = strappenda(dest, "/dev/console");
1119         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1120                 log_error("mknod() for /dev/console failed: %m");
1121                 return -errno;
1122         }
1123
1124         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1125                 log_error("Bind mount for /dev/console failed: %m");
1126                 return -errno;
1127         }
1128
1129         return 0;
1130 }
1131
1132 static int setup_kmsg(const char *dest, int kmsg_socket) {
1133         _cleanup_free_ char *from = NULL, *to = NULL;
1134         int r, fd, k;
1135         _cleanup_umask_ mode_t u;
1136         union {
1137                 struct cmsghdr cmsghdr;
1138                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1139         } control = {};
1140         struct msghdr mh = {
1141                 .msg_control = &control,
1142                 .msg_controllen = sizeof(control),
1143         };
1144         struct cmsghdr *cmsg;
1145
1146         assert(dest);
1147         assert(kmsg_socket >= 0);
1148
1149         u = umask(0000);
1150
1151         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1152          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1153          * on the reading side behave very similar to /proc/kmsg,
1154          * their writing side behaves differently from /dev/kmsg in
1155          * that writing blocks when nothing is reading. In order to
1156          * avoid any problems with containers deadlocking due to this
1157          * we simply make /dev/kmsg unavailable to the container. */
1158         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1159             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1160                 return log_oom();
1161
1162         if (mkfifo(from, 0600) < 0) {
1163                 log_error("mkfifo() for /dev/kmsg failed: %m");
1164                 return -errno;
1165         }
1166
1167         r = chmod_and_chown(from, 0600, 0, 0);
1168         if (r < 0) {
1169                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1170                 return r;
1171         }
1172
1173         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1174                 log_error("Bind mount for /proc/kmsg failed: %m");
1175                 return -errno;
1176         }
1177
1178         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1179         if (fd < 0) {
1180                 log_error("Failed to open fifo: %m");
1181                 return -errno;
1182         }
1183
1184         cmsg = CMSG_FIRSTHDR(&mh);
1185         cmsg->cmsg_level = SOL_SOCKET;
1186         cmsg->cmsg_type = SCM_RIGHTS;
1187         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1188         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1189
1190         mh.msg_controllen = cmsg->cmsg_len;
1191
1192         /* Store away the fd in the socket, so that it stays open as
1193          * long as we run the child */
1194         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1195         safe_close(fd);
1196
1197         if (k < 0) {
1198                 log_error("Failed to send FIFO fd: %m");
1199                 return -errno;
1200         }
1201
1202         /* And now make the FIFO unavailable as /dev/kmsg... */
1203         unlink(from);
1204         return 0;
1205 }
1206
1207 static int setup_hostname(void) {
1208
1209         if (arg_share_system)
1210                 return 0;
1211
1212         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1213                 return -errno;
1214
1215         return 0;
1216 }
1217
1218 static int setup_journal(const char *directory) {
1219         sd_id128_t machine_id, this_id;
1220         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1221         char *id;
1222         int r;
1223
1224         p = strappend(directory, "/etc/machine-id");
1225         if (!p)
1226                 return log_oom();
1227
1228         r = read_one_line_file(p, &b);
1229         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1230                 return 0;
1231         else if (r < 0) {
1232                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1233                 return r;
1234         }
1235
1236         id = strstrip(b);
1237         if (isempty(id) && arg_link_journal == LINK_AUTO)
1238                 return 0;
1239
1240         /* Verify validity */
1241         r = sd_id128_from_string(id, &machine_id);
1242         if (r < 0) {
1243                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1244                 return r;
1245         }
1246
1247         r = sd_id128_get_machine(&this_id);
1248         if (r < 0) {
1249                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1250                 return r;
1251         }
1252
1253         if (sd_id128_equal(machine_id, this_id)) {
1254                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1255                          "Host and machine ids are equal (%s): refusing to link journals", id);
1256                 if (arg_link_journal == LINK_AUTO)
1257                         return 0;
1258                 return
1259                         -EEXIST;
1260         }
1261
1262         if (arg_link_journal == LINK_NO)
1263                 return 0;
1264
1265         free(p);
1266         p = strappend("/var/log/journal/", id);
1267         q = strjoin(directory, "/var/log/journal/", id, NULL);
1268         if (!p || !q)
1269                 return log_oom();
1270
1271         if (path_is_mount_point(p, false) > 0) {
1272                 if (arg_link_journal != LINK_AUTO) {
1273                         log_error("%s: already a mount point, refusing to use for journal", p);
1274                         return -EEXIST;
1275                 }
1276
1277                 return 0;
1278         }
1279
1280         if (path_is_mount_point(q, false) > 0) {
1281                 if (arg_link_journal != LINK_AUTO) {
1282                         log_error("%s: already a mount point, refusing to use for journal", q);
1283                         return -EEXIST;
1284                 }
1285
1286                 return 0;
1287         }
1288
1289         r = readlink_and_make_absolute(p, &d);
1290         if (r >= 0) {
1291                 if ((arg_link_journal == LINK_GUEST ||
1292                      arg_link_journal == LINK_AUTO) &&
1293                     path_equal(d, q)) {
1294
1295                         r = mkdir_p(q, 0755);
1296                         if (r < 0)
1297                                 log_warning("failed to create directory %s: %m", q);
1298                         return 0;
1299                 }
1300
1301                 if (unlink(p) < 0) {
1302                         log_error("Failed to remove symlink %s: %m", p);
1303                         return -errno;
1304                 }
1305         } else if (r == -EINVAL) {
1306
1307                 if (arg_link_journal == LINK_GUEST &&
1308                     rmdir(p) < 0) {
1309
1310                         if (errno == ENOTDIR) {
1311                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1312                                 return r;
1313                         } else {
1314                                 log_error("Failed to remove %s: %m", p);
1315                                 return -errno;
1316                         }
1317                 }
1318         } else if (r != -ENOENT) {
1319                 log_error("readlink(%s) failed: %m", p);
1320                 return r;
1321         }
1322
1323         if (arg_link_journal == LINK_GUEST) {
1324
1325                 if (symlink(q, p) < 0) {
1326                         log_error("Failed to symlink %s to %s: %m", q, p);
1327                         return -errno;
1328                 }
1329
1330                 r = mkdir_p(q, 0755);
1331                 if (r < 0)
1332                         log_warning("failed to create directory %s: %m", q);
1333                 return 0;
1334         }
1335
1336         if (arg_link_journal == LINK_HOST) {
1337                 r = mkdir_p(p, 0755);
1338                 if (r < 0) {
1339                         log_error("Failed to create %s: %m", p);
1340                         return r;
1341                 }
1342
1343         } else if (access(p, F_OK) < 0)
1344                 return 0;
1345
1346         if (dir_is_empty(q) == 0)
1347                 log_warning("%s is not empty, proceeding anyway.", q);
1348
1349         r = mkdir_p(q, 0755);
1350         if (r < 0) {
1351                 log_error("Failed to create %s: %m", q);
1352                 return r;
1353         }
1354
1355         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1356                 log_error("Failed to bind mount journal from host into guest: %m");
1357                 return -errno;
1358         }
1359
1360         return 0;
1361 }
1362
1363 static int setup_kdbus(const char *dest, const char *path) {
1364         const char *p;
1365
1366         if (!path)
1367                 return 0;
1368
1369         p = strappenda(dest, "/dev/kdbus");
1370         if (mkdir(p, 0755) < 0) {
1371                 log_error("Failed to create kdbus path: %m");
1372                 return  -errno;
1373         }
1374
1375         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1376                 log_error("Failed to mount kdbus domain path: %m");
1377                 return -errno;
1378         }
1379
1380         return 0;
1381 }
1382
1383 static int drop_capabilities(void) {
1384         return capability_bounding_set_drop(~arg_retain, false);
1385 }
1386
1387 static int register_machine(pid_t pid, int local_ifindex) {
1388         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1389         _cleanup_bus_unref_ sd_bus *bus = NULL;
1390         int r;
1391
1392         if (!arg_register)
1393                 return 0;
1394
1395         r = sd_bus_default_system(&bus);
1396         if (r < 0) {
1397                 log_error("Failed to open system bus: %s", strerror(-r));
1398                 return r;
1399         }
1400
1401         if (arg_keep_unit) {
1402                 r = sd_bus_call_method(
1403                                 bus,
1404                                 "org.freedesktop.machine1",
1405                                 "/org/freedesktop/machine1",
1406                                 "org.freedesktop.machine1.Manager",
1407                                 "RegisterMachineWithNetwork",
1408                                 &error,
1409                                 NULL,
1410                                 "sayssusai",
1411                                 arg_machine,
1412                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1413                                 "nspawn",
1414                                 "container",
1415                                 (uint32_t) pid,
1416                                 strempty(arg_directory),
1417                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1418         } else {
1419                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1420
1421                 r = sd_bus_message_new_method_call(
1422                                 bus,
1423                                 &m,
1424                                 "org.freedesktop.machine1",
1425                                 "/org/freedesktop/machine1",
1426                                 "org.freedesktop.machine1.Manager",
1427                                 "CreateMachineWithNetwork");
1428                 if (r < 0) {
1429                         log_error("Failed to create message: %s", strerror(-r));
1430                         return r;
1431                 }
1432
1433                 r = sd_bus_message_append(
1434                                 m,
1435                                 "sayssusai",
1436                                 arg_machine,
1437                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1438                                 "nspawn",
1439                                 "container",
1440                                 (uint32_t) pid,
1441                                 strempty(arg_directory),
1442                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1443                 if (r < 0) {
1444                         log_error("Failed to append message arguments: %s", strerror(-r));
1445                         return r;
1446                 }
1447
1448                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1449                 if (r < 0) {
1450                         log_error("Failed to open container: %s", strerror(-r));
1451                         return r;
1452                 }
1453
1454                 if (!isempty(arg_slice)) {
1455                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1456                         if (r < 0) {
1457                                 log_error("Failed to append slice: %s", strerror(-r));
1458                                 return r;
1459                         }
1460                 }
1461
1462                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1463                 if (r < 0) {
1464                         log_error("Failed to add device policy: %s", strerror(-r));
1465                         return r;
1466                 }
1467
1468                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1469                                           /* Allow the container to
1470                                            * access and create the API
1471                                            * device nodes, so that
1472                                            * PrivateDevices= in the
1473                                            * container can work
1474                                            * fine */
1475                                           "/dev/null", "rwm",
1476                                           "/dev/zero", "rwm",
1477                                           "/dev/full", "rwm",
1478                                           "/dev/random", "rwm",
1479                                           "/dev/urandom", "rwm",
1480                                           "/dev/tty", "rwm",
1481                                           /* Allow the container
1482                                            * access to ptys. However,
1483                                            * do not permit the
1484                                            * container to ever create
1485                                            * these device nodes. */
1486                                           "/dev/pts/ptmx", "rw",
1487                                           "char-pts", "rw",
1488                                           /* Allow the container
1489                                            * access to all kdbus
1490                                            * devices. Again, the
1491                                            * container cannot create
1492                                            * these nodes, only use
1493                                            * them. We use a pretty
1494                                            * open match here, so that
1495                                            * the kernel API can still
1496                                            * change. */
1497                                           "char-kdbus", "rw",
1498                                           "char-kdbus/*", "rw");
1499                 if (r < 0) {
1500                         log_error("Failed to add device whitelist: %s", strerror(-r));
1501                         return r;
1502                 }
1503
1504                 r = sd_bus_message_close_container(m);
1505                 if (r < 0) {
1506                         log_error("Failed to close container: %s", strerror(-r));
1507                         return r;
1508                 }
1509
1510                 r = sd_bus_call(bus, m, 0, &error, NULL);
1511         }
1512
1513         if (r < 0) {
1514                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1515                 return r;
1516         }
1517
1518         return 0;
1519 }
1520
1521 static int terminate_machine(pid_t pid) {
1522         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1523         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1524         _cleanup_bus_unref_ sd_bus *bus = NULL;
1525         const char *path;
1526         int r;
1527
1528         if (!arg_register)
1529                 return 0;
1530
1531         r = sd_bus_default_system(&bus);
1532         if (r < 0) {
1533                 log_error("Failed to open system bus: %s", strerror(-r));
1534                 return r;
1535         }
1536
1537         r = sd_bus_call_method(
1538                         bus,
1539                         "org.freedesktop.machine1",
1540                         "/org/freedesktop/machine1",
1541                         "org.freedesktop.machine1.Manager",
1542                         "GetMachineByPID",
1543                         &error,
1544                         &reply,
1545                         "u",
1546                         (uint32_t) pid);
1547         if (r < 0) {
1548                 /* Note that the machine might already have been
1549                  * cleaned up automatically, hence don't consider it a
1550                  * failure if we cannot get the machine object. */
1551                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1552                 return 0;
1553         }
1554
1555         r = sd_bus_message_read(reply, "o", &path);
1556         if (r < 0)
1557                 return bus_log_parse_error(r);
1558
1559         r = sd_bus_call_method(
1560                         bus,
1561                         "org.freedesktop.machine1",
1562                         path,
1563                         "org.freedesktop.machine1.Machine",
1564                         "Terminate",
1565                         &error,
1566                         NULL,
1567                         NULL);
1568         if (r < 0) {
1569                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1570                 return 0;
1571         }
1572
1573         return 0;
1574 }
1575
1576 static int reset_audit_loginuid(void) {
1577         _cleanup_free_ char *p = NULL;
1578         int r;
1579
1580         if (arg_share_system)
1581                 return 0;
1582
1583         r = read_one_line_file("/proc/self/loginuid", &p);
1584         if (r == -ENOENT)
1585                 return 0;
1586         if (r < 0) {
1587                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1588                 return r;
1589         }
1590
1591         /* Already reset? */
1592         if (streq(p, "4294967295"))
1593                 return 0;
1594
1595         r = write_string_file("/proc/self/loginuid", "4294967295");
1596         if (r < 0) {
1597                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1598                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1599                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1600                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1601                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1602
1603                 sleep(5);
1604         }
1605
1606         return 0;
1607 }
1608
1609 #define HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1610
1611 static int get_mac(struct ether_addr *mac) {
1612         int r;
1613
1614         uint8_t result[8];
1615         size_t l, sz;
1616         uint8_t *v;
1617
1618         l = strlen(arg_machine);
1619         sz = sizeof(sd_id128_t) + l;
1620         v = alloca(sz);
1621
1622         /* fetch some persistent data unique to the host */
1623         r = sd_id128_get_machine((sd_id128_t*) v);
1624         if (r < 0)
1625                 return r;
1626
1627         /* combine with some data unique (on this host) to this
1628          * container instance */
1629         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1630
1631         /* Let's hash the host machine ID plus the container name. We
1632          * use a fixed, but originally randomly created hash key here. */
1633         siphash24(result, v, sz, HASH_KEY.bytes);
1634
1635         assert_cc(ETH_ALEN <= sizeof(result));
1636         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1637
1638         /* see eth_random_addr in the kernel */
1639         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1640         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1641
1642         return 0;
1643 }
1644
1645 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1646         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1647         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1648         struct ether_addr mac;
1649         int r, i;
1650
1651         if (!arg_private_network)
1652                 return 0;
1653
1654         if (!arg_network_veth)
1655                 return 0;
1656
1657         /* Use two different interface name prefixes depending whether
1658          * we are in bridge mode or not. */
1659         snprintf(iface_name, IFNAMSIZ, "%s-%s",
1660                  arg_network_bridge ? "vb" : "ve", arg_machine);
1661
1662         r = get_mac(&mac);
1663         if (r < 0) {
1664                 log_error("Failed to generate predictable MAC address for host0");
1665                 return r;
1666         }
1667
1668         r = sd_rtnl_open(&rtnl, 0);
1669         if (r < 0) {
1670                 log_error("Failed to connect to netlink: %s", strerror(-r));
1671                 return r;
1672         }
1673
1674         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1675         if (r < 0) {
1676                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1677                 return r;
1678         }
1679
1680         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1681         if (r < 0) {
1682                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1683                 return r;
1684         }
1685
1686         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1687         if (r < 0) {
1688                 log_error("Failed to open netlink container: %s", strerror(-r));
1689                 return r;
1690         }
1691
1692         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1693         if (r < 0) {
1694                 log_error("Failed to open netlink container: %s", strerror(-r));
1695                 return r;
1696         }
1697
1698         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1699         if (r < 0) {
1700                 log_error("Failed to open netlink container: %s", strerror(-r));
1701                 return r;
1702         }
1703
1704         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1705         if (r < 0) {
1706                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1707                 return r;
1708         }
1709
1710         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1711         if (r < 0) {
1712                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1713                 return r;
1714         }
1715
1716         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1717         if (r < 0) {
1718                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1719                 return r;
1720         }
1721
1722         r = sd_rtnl_message_close_container(m);
1723         if (r < 0) {
1724                 log_error("Failed to close netlink container: %s", strerror(-r));
1725                 return r;
1726         }
1727
1728         r = sd_rtnl_message_close_container(m);
1729         if (r < 0) {
1730                 log_error("Failed to close netlink container: %s", strerror(-r));
1731                 return r;
1732         }
1733
1734         r = sd_rtnl_message_close_container(m);
1735         if (r < 0) {
1736                 log_error("Failed to close netlink container: %s", strerror(-r));
1737                 return r;
1738         }
1739
1740         r = sd_rtnl_call(rtnl, m, 0, NULL);
1741         if (r < 0) {
1742                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1743                 return r;
1744         }
1745
1746         i = (int) if_nametoindex(iface_name);
1747         if (i <= 0) {
1748                 log_error("Failed to resolve interface %s: %m", iface_name);
1749                 return -errno;
1750         }
1751
1752         *ifi = i;
1753
1754         return 0;
1755 }
1756
1757 static int setup_bridge(const char veth_name[], int *ifi) {
1758         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1759         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1760         int r, bridge;
1761
1762         if (!arg_private_network)
1763                 return 0;
1764
1765         if (!arg_network_veth)
1766                 return 0;
1767
1768         if (!arg_network_bridge)
1769                 return 0;
1770
1771         bridge = (int) if_nametoindex(arg_network_bridge);
1772         if (bridge <= 0) {
1773                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1774                 return -errno;
1775         }
1776
1777         *ifi = bridge;
1778
1779         r = sd_rtnl_open(&rtnl, 0);
1780         if (r < 0) {
1781                 log_error("Failed to connect to netlink: %s", strerror(-r));
1782                 return r;
1783         }
1784
1785         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1786         if (r < 0) {
1787                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1788                 return r;
1789         }
1790
1791         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1792         if (r < 0) {
1793                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1794                 return r;
1795         }
1796
1797         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1798         if (r < 0) {
1799                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1800                 return r;
1801         }
1802
1803         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1804         if (r < 0) {
1805                 log_error("Failed to add netlink master field: %s", strerror(-r));
1806                 return r;
1807         }
1808
1809         r = sd_rtnl_call(rtnl, m, 0, NULL);
1810         if (r < 0) {
1811                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1812                 return r;
1813         }
1814
1815         return 0;
1816 }
1817
1818 static int parse_interface(struct udev *udev, const char *name) {
1819         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1820         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1821         int ifi;
1822
1823         ifi = (int) if_nametoindex(name);
1824         if (ifi <= 0) {
1825                 log_error("Failed to resolve interface %s: %m", name);
1826                 return -errno;
1827         }
1828
1829         sprintf(ifi_str, "n%i", ifi);
1830         d = udev_device_new_from_device_id(udev, ifi_str);
1831         if (!d) {
1832                 log_error("Failed to get udev device for interface %s: %m", name);
1833                 return -errno;
1834         }
1835
1836         if (udev_device_get_is_initialized(d) <= 0) {
1837                 log_error("Network interface %s is not initialized yet.", name);
1838                 return -EBUSY;
1839         }
1840
1841         return ifi;
1842 }
1843
1844 static int move_network_interfaces(pid_t pid) {
1845         _cleanup_udev_unref_ struct udev *udev = NULL;
1846         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1847         char **i;
1848         int r;
1849
1850         if (!arg_private_network)
1851                 return 0;
1852
1853         if (strv_isempty(arg_network_interfaces))
1854                 return 0;
1855
1856         r = sd_rtnl_open(&rtnl, 0);
1857         if (r < 0) {
1858                 log_error("Failed to connect to netlink: %s", strerror(-r));
1859                 return r;
1860         }
1861
1862         udev = udev_new();
1863         if (!udev) {
1864                 log_error("Failed to connect to udev.");
1865                 return -ENOMEM;
1866         }
1867
1868         STRV_FOREACH(i, arg_network_interfaces) {
1869                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1870                 int ifi;
1871
1872                 ifi = parse_interface(udev, *i);
1873                 if (ifi < 0)
1874                         return ifi;
1875
1876                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1877                 if (r < 0) {
1878                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1879                         return r;
1880                 }
1881
1882                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1883                 if (r < 0) {
1884                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1885                         return r;
1886                 }
1887
1888                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1889                 if (r < 0) {
1890                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1891                         return r;
1892                 }
1893         }
1894
1895         return 0;
1896 }
1897
1898 static int setup_macvlan(pid_t pid) {
1899         _cleanup_udev_unref_ struct udev *udev = NULL;
1900         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1901         char **i;
1902         int r;
1903
1904         if (!arg_private_network)
1905                 return 0;
1906
1907         if (strv_isempty(arg_network_macvlan))
1908                 return 0;
1909
1910         r = sd_rtnl_open(&rtnl, 0);
1911         if (r < 0) {
1912                 log_error("Failed to connect to netlink: %s", strerror(-r));
1913                 return r;
1914         }
1915
1916         udev = udev_new();
1917         if (!udev) {
1918                 log_error("Failed to connect to udev.");
1919                 return -ENOMEM;
1920         }
1921
1922         STRV_FOREACH(i, arg_network_macvlan) {
1923                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1924                 _cleanup_free_ char *n = NULL;
1925                 int ifi;
1926
1927                 ifi = parse_interface(udev, *i);
1928                 if (ifi < 0)
1929                         return ifi;
1930
1931                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1932                 if (r < 0) {
1933                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1934                         return r;
1935                 }
1936
1937                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1938                 if (r < 0) {
1939                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1940                         return r;
1941                 }
1942
1943                 n = strappend("mv-", *i);
1944                 if (!n)
1945                         return log_oom();
1946
1947                 strshorten(n, IFNAMSIZ-1);
1948
1949                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1950                 if (r < 0) {
1951                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1952                         return r;
1953                 }
1954
1955                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1956                 if (r < 0) {
1957                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1958                         return r;
1959                 }
1960
1961                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1962                 if (r < 0) {
1963                         log_error("Failed to open netlink container: %s", strerror(-r));
1964                         return r;
1965                 }
1966
1967                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1968                 if (r < 0) {
1969                         log_error("Failed to open netlink container: %s", strerror(-r));
1970                         return r;
1971                 }
1972
1973                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1974                 if (r < 0) {
1975                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1976                         return r;
1977                 }
1978
1979                 r = sd_rtnl_message_close_container(m);
1980                 if (r < 0) {
1981                         log_error("Failed to close netlink container: %s", strerror(-r));
1982                         return r;
1983                 }
1984
1985                 r = sd_rtnl_message_close_container(m);
1986                 if (r < 0) {
1987                         log_error("Failed to close netlink container: %s", strerror(-r));
1988                         return r;
1989                 }
1990
1991                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1992                 if (r < 0) {
1993                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1994                         return r;
1995                 }
1996         }
1997
1998         return 0;
1999 }
2000
/* Installs a seccomp filter for the current process. The filter allows
 * everything by default, makes a fixed list of system calls fail with EPERM
 * and makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * When built without HAVE_SECCOMP this is a no-op returning 0.
 *
 * Returns the result of seccomp_load() on success, or a negative
 * errno-style value on failure. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* System calls that are refused with EPERM */
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned idx;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        for (idx = 0; idx < ELEMENTSOF(blocked_syscalls); idx++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[idx], 0);

                /* -EFAULT means the syscall is unknown; just skip it */
                if (r >= 0 || r == -EFAULT)
                        continue;

                log_error("Failed to block syscall: %s", strerror(-r));
                goto finish;
        }

        /* Audit is broken in containers, and much of the userspace audit
         * hookup fails when running inside one. We don't care and simply
         * turn off creation of audit sockets.
         *
         * The rule below makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail
         * with EAFNOSUPPORT, which audit userspace takes as indication
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        /* Do not have libseccomp turn on NO_NEW_PRIVS when loading */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* The filter context is released on every path */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2080
2081 static int setup_image(char **device_path, int *loop_nr) {
2082         struct loop_info64 info = {
2083                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2084         };
2085         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2086         _cleanup_free_ char* loopdev = NULL;
2087         struct stat st;
2088         int r, nr;
2089
2090         assert(device_path);
2091         assert(loop_nr);
2092
2093         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2094         if (fd < 0) {
2095                 log_error("Failed to open %s: %m", arg_image);
2096                 return -errno;
2097         }
2098
2099         if (fstat(fd, &st) < 0) {
2100                 log_error("Failed to stat %s: %m", arg_image);
2101                 return -errno;
2102         }
2103
2104         if (S_ISBLK(st.st_mode)) {
2105                 char *p;
2106
2107                 p = strdup(arg_image);
2108                 if (!p)
2109                         return log_oom();
2110
2111                 *device_path = p;
2112
2113                 *loop_nr = -1;
2114
2115                 r = fd;
2116                 fd = -1;
2117
2118                 return r;
2119         }
2120
2121         if (!S_ISREG(st.st_mode)) {
2122                 log_error("%s is not a regular file or block device: %m", arg_image);
2123                 return -EINVAL;
2124         }
2125
2126         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2127         if (control < 0) {
2128                 log_error("Failed to open /dev/loop-control: %m");
2129                 return -errno;
2130         }
2131
2132         nr = ioctl(control, LOOP_CTL_GET_FREE);
2133         if (nr < 0) {
2134                 log_error("Failed to allocate loop device: %m");
2135                 return -errno;
2136         }
2137
2138         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2139                 return log_oom();
2140
2141         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2142         if (loop < 0) {
2143                 log_error("Failed to open loop device %s: %m", loopdev);
2144                 return -errno;
2145         }
2146
2147         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2148                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2149                 return -errno;
2150         }
2151
2152         if (arg_read_only)
2153                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2154
2155         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2156                 log_error("Failed to set loopback settings on %s: %m", loopdev);
2157                 return -errno;
2158         }
2159
2160         *device_path = loopdev;
2161         loopdev = NULL;
2162
2163         *loop_nr = nr;
2164
2165         r = loop;
2166         loop = -1;
2167
2168         return r;
2169 }
2170
#ifdef HAVE_BLKID
/* Records `node` as the device for one partition role, unless a candidate
 * with a lower or equal partition number has been recorded already.
 *
 * *device, *device_nr and *device_rw hold the current candidate (or NULL /
 * unspecified values if there is none) and are updated together. `node` is
 * copied; a previously recorded string is freed.
 *
 * Returns 0 on success (whether or not the candidate was replaced) and the
 * result of log_oom() if the copy cannot be allocated. */
static int dissect_image_pick_partition(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, bool rw) {

        char *copy;

        /* Lowest partition number wins */
        if (*device && nr >= *device_nr)
                return 0;

        copy = strdup(node);
        if (!copy)
                return log_oom();

        free(*device);
        *device = copy;
        *device_nr = nr;
        *device_rw = rw;

        return 0;
}
#endif

/* Probes the block device open as `fd` for a GPT partition table and picks
 * the partitions to use by their GPT type: GPT_HOME, GPT_SRV and, where the
 * macros are defined for the build, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY.
 * Partitions carrying GPT_FLAG_NO_AUTO are ignored; if several partitions
 * share a type, the one with the lowest partition number is used.
 *
 * On success returns 0 and stores newly allocated device node paths, which
 * the caller must free:
 *   - *root_device / *root_device_rw: the native root partition if one was
 *     found (*secondary = false), otherwise the secondary root partition
 *     (*secondary = true);
 *   - *home_device / *home_device_rw and *srv_device / *srv_device_rw: only
 *     written if such a partition exists, otherwise left untouched.
 * The *_rw flags are false if the partition has GPT_FLAG_READ_ONLY set.
 *
 * Returns -EINVAL if there is no partition table, it is not GPT, or no root
 * partition was found; another negative errno-style value on other
 * failures; -ENOTSUP when built without HAVE_BLKID. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared before the blkid calls below; if a call fails
         * and errno is still 0, the failure is treated as out-of-memory. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* Ambivalent result, or nothing detected */
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate the udev devices that have the whole-disk device as
         * parent; these are matched against the blkid partition list. */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Initialized so that the cleanup handler never sees an
                 * indeterminate pointer */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                bool rw;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                rw = !(flags & GPT_FLAG_READ_ONLY);

                /* Dispatch on the partition type; unknown types are ignored */
                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_image_pick_partition(&home, &home_nr, &home_rw, node, nr, rw);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_image_pick_partition(&srv, &srv_nr, &srv_rw, node, nr, rw);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_image_pick_partition(&root, &root_nr, &root_rw, node, nr, rw);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_image_pick_partition(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, rw);
#endif
                else
                        r = 0;

                if (r < 0)
                        return r;
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Hand the strings over to the caller; resetting the locals to NULL
         * keeps the cleanup handlers from freeing them. The native root
         * partition takes precedence over the secondary one. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2420
/* Mounts the block device `what` on `where`, or on `where` with `directory`
 * appended if `directory` is non-NULL. The file system type is detected with
 * libblkid. The mount always uses MS_NODEV and is read-only if `rw` is false
 * or arg_read_only is set.
 *
 * Returns 0 on success; -EINVAL if the file system type cannot be
 * determined; -ENOTSUP for LUKS volumes or when built without HAVE_BLKID;
 * another negative errno-style value on other failures. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* errno is cleared first so that a failure leaving it at 0 can be
         * told apart (treated as out-of-memory). */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns -2 for an ambivalent result
                 * and 1 if nothing was detected; -1 is a real error and is
                 * handled by the branch below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2486
/* Mounts the partitions found in a disk image below 'where': the root
 * device on 'where' itself, the home device on 'where'/home and the
 * server data device on 'where'/srv. Each device argument may be NULL,
 * in which case that mount is skipped; the matching *_rw flag selects a
 * writable mount.
 *
 * Returns 0 on success, or the negative error returned by
 * mount_device() for the first mount that fails. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Mount relative to the 'where' argument rather than reaching
         * for the global arg_directory, so that the parameter is
         * actually honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2522
2523 static void loop_remove(int nr, int *image_fd) {
2524         _cleanup_close_ int control = -1;
2525
2526         if (nr < 0)
2527                 return;
2528
2529         if (image_fd && *image_fd >= 0) {
2530                 ioctl(*image_fd, LOOP_CLR_FD);
2531                 *image_fd = safe_close(*image_fd);
2532         }
2533
2534         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2535         if (control < 0)
2536                 return;
2537
2538         ioctl(control, LOOP_CTL_REMOVE, nr);
2539 }
2540
2541 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2542         int pipe_fds[2];
2543         pid_t pid;
2544
2545         assert(database);
2546         assert(key);
2547         assert(rpid);
2548
2549         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2550                 log_error("Failed to allocate pipe: %m");
2551                 return -errno;
2552         }
2553
2554         pid = fork();
2555         if (pid < 0) {
2556                 log_error("Failed to fork getent child: %m");
2557                 return -errno;
2558         } else if (pid == 0) {
2559                 int nullfd;
2560                 char *empty_env = NULL;
2561
2562                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2563                         _exit(EXIT_FAILURE);
2564
2565                 if (pipe_fds[0] > 2)
2566                         safe_close(pipe_fds[0]);
2567                 if (pipe_fds[1] > 2)
2568                         safe_close(pipe_fds[1]);
2569
2570                 nullfd = open("/dev/null", O_RDWR);
2571                 if (nullfd < 0)
2572                         _exit(EXIT_FAILURE);
2573
2574                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2575                         _exit(EXIT_FAILURE);
2576
2577                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2578                         _exit(EXIT_FAILURE);
2579
2580                 if (nullfd > 2)
2581                         safe_close(nullfd);
2582
2583                 reset_all_signal_handlers();
2584                 close_all_fds(NULL, 0);
2585
2586                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2587                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2588                 _exit(EXIT_FAILURE);
2589         }
2590
2591         pipe_fds[1] = safe_close(pipe_fds[1]);
2592
2593         *rpid = pid;
2594
2595         return pipe_fds[0];
2596 }
2597
2598 static int change_uid_gid(char **_home) {
2599         char line[LINE_MAX], *x, *u, *g, *h;
2600         const char *word, *state;
2601         _cleanup_free_ uid_t *uids = NULL;
2602         _cleanup_free_ char *home = NULL;
2603         _cleanup_fclose_ FILE *f = NULL;
2604         _cleanup_close_ int fd = -1;
2605         unsigned n_uids = 0;
2606         size_t sz = 0, l;
2607         uid_t uid;
2608         gid_t gid;
2609         pid_t pid;
2610         int r;
2611
2612         assert(_home);
2613
2614         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2615                 /* Reset everything fully to 0, just in case */
2616
2617                 if (setgroups(0, NULL) < 0) {
2618                         log_error("setgroups() failed: %m");
2619                         return -errno;
2620                 }
2621
2622                 if (setresgid(0, 0, 0) < 0) {
2623                         log_error("setregid() failed: %m");
2624                         return -errno;
2625                 }
2626
2627                 if (setresuid(0, 0, 0) < 0) {
2628                         log_error("setreuid() failed: %m");
2629                         return -errno;
2630                 }
2631
2632                 *_home = NULL;
2633                 return 0;
2634         }
2635
2636         /* First, get user credentials */
2637         fd = spawn_getent("passwd", arg_user, &pid);
2638         if (fd < 0)
2639                 return fd;
2640
2641         f = fdopen(fd, "r");
2642         if (!f)
2643                 return log_oom();
2644         fd = -1;
2645
2646         if (!fgets(line, sizeof(line), f)) {
2647
2648                 if (!ferror(f)) {
2649                         log_error("Failed to resolve user %s.", arg_user);
2650                         return -ESRCH;
2651                 }
2652
2653                 log_error("Failed to read from getent: %m");
2654                 return -errno;
2655         }
2656
2657         truncate_nl(line);
2658
2659         wait_for_terminate_and_warn("getent passwd", pid);
2660
2661         x = strchr(line, ':');
2662         if (!x) {
2663                 log_error("/etc/passwd entry has invalid user field.");
2664                 return -EIO;
2665         }
2666
2667         u = strchr(x+1, ':');
2668         if (!u) {
2669                 log_error("/etc/passwd entry has invalid password field.");
2670                 return -EIO;
2671         }
2672
2673         u++;
2674         g = strchr(u, ':');
2675         if (!g) {
2676                 log_error("/etc/passwd entry has invalid UID field.");
2677                 return -EIO;
2678         }
2679
2680         *g = 0;
2681         g++;
2682         x = strchr(g, ':');
2683         if (!x) {
2684                 log_error("/etc/passwd entry has invalid GID field.");
2685                 return -EIO;
2686         }
2687
2688         *x = 0;
2689         h = strchr(x+1, ':');
2690         if (!h) {
2691                 log_error("/etc/passwd entry has invalid GECOS field.");
2692                 return -EIO;
2693         }
2694
2695         h++;
2696         x = strchr(h, ':');
2697         if (!x) {
2698                 log_error("/etc/passwd entry has invalid home directory field.");
2699                 return -EIO;
2700         }
2701
2702         *x = 0;
2703
2704         r = parse_uid(u, &uid);
2705         if (r < 0) {
2706                 log_error("Failed to parse UID of user.");
2707                 return -EIO;
2708         }
2709
2710         r = parse_gid(g, &gid);
2711         if (r < 0) {
2712                 log_error("Failed to parse GID of user.");
2713                 return -EIO;
2714         }
2715
2716         home = strdup(h);
2717         if (!home)
2718                 return log_oom();
2719
2720         /* Second, get group memberships */
2721         fd = spawn_getent("initgroups", arg_user, &pid);
2722         if (fd < 0)
2723                 return fd;
2724
2725         fclose(f);
2726         f = fdopen(fd, "r");
2727         if (!f)
2728                 return log_oom();
2729         fd = -1;
2730
2731         if (!fgets(line, sizeof(line), f)) {
2732                 if (!ferror(f)) {
2733                         log_error("Failed to resolve user %s.", arg_user);
2734                         return -ESRCH;
2735                 }
2736
2737                 log_error("Failed to read from getent: %m");
2738                 return -errno;
2739         }
2740
2741         truncate_nl(line);
2742
2743         wait_for_terminate_and_warn("getent initgroups", pid);
2744
2745         /* Skip over the username and subsequent separator whitespace */
2746         x = line;
2747         x += strcspn(x, WHITESPACE);
2748         x += strspn(x, WHITESPACE);
2749
2750         FOREACH_WORD(word, l, x, state) {
2751                 char c[l+1];
2752
2753                 memcpy(c, word, l);
2754                 c[l] = 0;
2755
2756                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2757                         return log_oom();
2758
2759                 r = parse_uid(c, &uids[n_uids++]);
2760                 if (r < 0) {
2761                         log_error("Failed to parse group data from getent.");
2762                         return -EIO;
2763                 }
2764         }
2765
2766         r = mkdir_parents(home, 0775);
2767         if (r < 0) {
2768                 log_error("Failed to make home root directory: %s", strerror(-r));
2769                 return r;
2770         }
2771
2772         r = mkdir_safe(home, 0755, uid, gid);
2773         if (r < 0 && r != -EEXIST) {
2774                 log_error("Failed to make home directory: %s", strerror(-r));
2775                 return r;
2776         }
2777
2778         fchown(STDIN_FILENO, uid, gid);
2779         fchown(STDOUT_FILENO, uid, gid);
2780         fchown(STDERR_FILENO, uid, gid);
2781
2782         if (setgroups(n_uids, uids) < 0) {
2783                 log_error("Failed to set auxiliary groups: %m");
2784                 return -errno;
2785         }
2786
2787         if (setresgid(gid, gid, gid) < 0) {
2788                 log_error("setregid() failed: %m");
2789                 return -errno;
2790         }
2791
2792         if (setresuid(uid, uid, uid) < 0) {
2793                 log_error("setreuid() failed: %m");
2794                 return -errno;
2795         }
2796
2797         if (_home) {
2798                 *_home = home;
2799                 home = NULL;
2800         }
2801
2802         return 0;
2803 }
2804
2805 /*
2806  * Return values:
2807  * < 0 : wait_for_terminate() failed to get the state of the
2808  *       container, the container was terminated by a signal, or
2809  *       failed for an unknown reason.  No change is made to the
2810  *       container argument.
2811  * > 0 : The program executed in the container terminated with an
2812  *       error.  The exit code of the program executed in the
2813  *       container is returned.  No change is made to the container
2814  *       argument.
2815  *   0 : The container is being rebooted, has been shut down or exited
2816  *       successfully.  The container argument has been set to either
2817  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2818  *
2819  * That is, success is indicated by a return value of zero, and an
2820  * error is indicated by a non-zero value.
2821  */
2822 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2823         int r;
2824         siginfo_t status;
2825
2826         r = wait_for_terminate(pid, &status);
2827         if (r < 0) {
2828                 log_warning("Failed to wait for container: %s", strerror(-r));
2829                 return r;
2830         }
2831
2832         switch (status.si_code) {
2833         case CLD_EXITED:
2834                 r = status.si_status;
2835                 if (r == 0) {
2836                         if (!arg_quiet)
2837                                 log_debug("Container %s exited successfully.",
2838                                           arg_machine);
2839
2840                         *container = CONTAINER_TERMINATED;
2841                 } else {
2842                         log_error("Container %s failed with error code %i.",
2843                                   arg_machine, status.si_status);
2844                 }
2845                 break;
2846
2847         case CLD_KILLED:
2848                 if (status.si_status == SIGINT) {
2849                         if (!arg_quiet)
2850                                 log_info("Container %s has been shut down.",
2851                                          arg_machine);
2852
2853                         *container = CONTAINER_TERMINATED;
2854                         r = 0;
2855                         break;
2856                 } else if (status.si_status == SIGHUP) {
2857                         if (!arg_quiet)
2858                                 log_info("Container %s is being rebooted.",
2859                                          arg_machine);
2860
2861                         *container = CONTAINER_REBOOTED;
2862                         r = 0;
2863                         break;
2864                 }
2865                 /* CLD_KILLED fallthrough */
2866
2867         case CLD_DUMPED:
2868                 log_error("Container %s terminated by signal %s.",
2869                           arg_machine, signal_to_string(status.si_status));
2870                 r = -1;
2871                 break;
2872
2873         default:
2874                 log_error("Container %s failed due to unknown reason.",
2875                           arg_machine);
2876                 r = -1;
2877                 break;
2878         }
2879
2880         return r;
2881 }
2882
/* Signal handler that deliberately does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
2884
2885 int main(int argc, char *argv[]) {
2886
2887         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2888         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2889         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2890         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2891         _cleanup_fdset_free_ FDSet *fds = NULL;
2892         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2893         const char *console = NULL;
2894         char veth_name[IFNAMSIZ];
2895         bool secondary = false;
2896         sigset_t mask, mask_chld;
2897         pid_t pid = 0;
2898
2899         log_parse_environment();
2900         log_open();
2901
2902         k = parse_argv(argc, argv);
2903         if (k < 0)
2904                 goto finish;
2905         else if (k == 0) {
2906                 r = EXIT_SUCCESS;
2907                 goto finish;
2908         }
2909
2910         if (!arg_image) {
2911                 if (arg_directory) {
2912                         char *p;
2913
2914                         p = path_make_absolute_cwd(arg_directory);
2915                         free(arg_directory);
2916                         arg_directory = p;
2917                 } else
2918                         arg_directory = get_current_dir_name();
2919
2920                 if (!arg_directory) {
2921                         log_error("Failed to determine path, please use -D.");
2922                         goto finish;
2923                 }
2924                 path_kill_slashes(arg_directory);
2925         }
2926
2927         if (!arg_machine) {
2928                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2929                 if (!arg_machine) {
2930                         log_oom();
2931                         goto finish;
2932                 }
2933
2934                 hostname_cleanup(arg_machine, false);
2935                 if (isempty(arg_machine)) {
2936                         log_error("Failed to determine machine name automatically, please use -M.");
2937                         goto finish;
2938                 }
2939         }
2940
2941         if (geteuid() != 0) {
2942                 log_error("Need to be root.");
2943                 goto finish;
2944         }
2945
2946         if (sd_booted() <= 0) {
2947                 log_error("Not running on a systemd system.");
2948                 goto finish;
2949         }
2950
2951         log_close();
2952         n_fd_passed = sd_listen_fds(false);
2953         if (n_fd_passed > 0) {
2954                 k = fdset_new_listen_fds(&fds, false);
2955                 if (k < 0) {
2956                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2957                         goto finish;
2958                 }
2959         }
2960         fdset_close_others(fds);
2961         log_open();
2962
2963         if (arg_directory) {
2964                 if (path_equal(arg_directory, "/")) {
2965                         log_error("Spawning container on root directory not supported.");
2966                         goto finish;
2967                 }
2968
2969                 if (arg_boot) {
2970                         if (path_is_os_tree(arg_directory) <= 0) {
2971                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2972                                 goto finish;
2973                         }
2974                 } else {
2975                         const char *p;
2976
2977                         p = strappenda(arg_directory,
2978                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2979                         if (access(p, F_OK) < 0) {
2980                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2981                                 goto finish;
2982
2983                         }
2984                 }
2985         } else {
2986                 char template[] = "/tmp/nspawn-root-XXXXXX";
2987
2988                 if (!mkdtemp(template)) {
2989                         log_error("Failed to create temporary directory: %m");
2990                         r = -errno;
2991                         goto finish;
2992                 }
2993
2994                 arg_directory = strdup(template);
2995                 if (!arg_directory) {
2996                         r = log_oom();
2997                         goto finish;
2998                 }
2999
3000                 image_fd = setup_image(&device_path, &loop_nr);
3001                 if (image_fd < 0) {
3002                         r = image_fd;
3003                         goto finish;
3004                 }
3005
3006                 r = dissect_image(image_fd,
3007                                   &root_device, &root_device_rw,
3008                                   &home_device, &home_device_rw,
3009                                   &srv_device, &srv_device_rw,
3010                                   &secondary);
3011                 if (r < 0)
3012                         goto finish;
3013         }
3014
3015         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3016         if (master < 0) {
3017                 log_error("Failed to acquire pseudo tty: %m");
3018                 goto finish;
3019         }
3020
3021         console = ptsname(master);
3022         if (!console) {
3023                 log_error("Failed to determine tty name: %m");
3024                 goto finish;
3025         }
3026
3027         if (!arg_quiet)
3028                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3029                          arg_machine, arg_image ? arg_image : arg_directory);
3030
3031         if (unlockpt(master) < 0) {
3032                 log_error("Failed to unlock tty: %m");
3033                 goto finish;
3034         }
3035
3036         if (access("/dev/kdbus/control", F_OK) >= 0) {
3037
3038                 if (arg_share_system) {
3039                         kdbus_domain = strdup("/dev/kdbus");
3040                         if (!kdbus_domain) {
3041                                 log_oom();
3042                                 goto finish;
3043                         }
3044                 } else {
3045                         const char *ns;
3046
3047                         ns = strappenda("machine-", arg_machine);
3048                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
3049                         if (r < 0)
3050                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
3051                         else
3052                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
3053                 }
3054         }
3055
3056         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3057                 log_error("Failed to create kmsg socket pair: %m");
3058                 goto finish;
3059         }
3060
3061         sd_notify(0, "READY=1");
3062
3063         assert_se(sigemptyset(&mask) == 0);
3064         assert_se(sigemptyset(&mask_chld) == 0);
3065         sigaddset(&mask_chld, SIGCHLD);
3066         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3067         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3068
3069         for (;;) {
3070                 ContainerStatus container_status;
3071                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3072                 struct sigaction sa = {
3073                         .sa_handler = nop_handler,
3074                         .sa_flags = SA_NOCLDSTOP,
3075                 };
3076
3077                 r = barrier_create(&barrier);
3078                 if (r < 0) {
3079                         log_error("Cannot initialize IPC barrier: %s", strerror(-r));
3080                         goto finish;
3081                 }
3082
3083                 /* Child can be killed before execv(), so handle SIGCHLD
3084                  * in order to interrupt parent's blocking calls and
3085                  * give it a chance to call wait() and terminate. */
3086                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3087                 if (r < 0) {
3088                         log_error("Failed to change the signal mask: %m");
3089                         goto finish;
3090                 }
3091
3092                 r = sigaction(SIGCHLD, &sa, NULL);
3093                 if (r < 0) {
3094                         log_error("Failed to install SIGCHLD handler: %m");
3095                         goto finish;
3096                 }
3097
3098                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3099                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3100                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
3101                 if (pid < 0) {
3102                         if (errno == EINVAL)
3103                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3104                         else
3105                                 log_error("clone() failed: %m");
3106
3107                         r = pid;
3108                         goto finish;
3109                 }
3110
3111                 if (pid == 0) {
3112                         /* child */
3113                         _cleanup_free_ char *home = NULL;
3114                         unsigned n_env = 2;
3115                         const char *envp[] = {
3116                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3117                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3118                                 NULL, /* TERM */
3119                                 NULL, /* HOME */
3120                                 NULL, /* USER */
3121                                 NULL, /* LOGNAME */
3122                                 NULL, /* container_uuid */
3123                                 NULL, /* LISTEN_FDS */
3124                                 NULL, /* LISTEN_PID */
3125                                 NULL
3126                         };
3127                         char **env_use;
3128
3129                         barrier_set_role(&barrier, BARRIER_CHILD);
3130
3131                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3132                         if (envp[n_env])
3133                                 n_env ++;
3134
3135                         master = safe_close(master);
3136
3137                         close_nointr(STDIN_FILENO);
3138                         close_nointr(STDOUT_FILENO);
3139                         close_nointr(STDERR_FILENO);
3140
3141                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3142
3143                         reset_all_signal_handlers();
3144
3145                         assert_se(sigemptyset(&mask) == 0);
3146                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
3147
3148                         k = open_terminal(console, O_RDWR);
3149                         if (k != STDIN_FILENO) {
3150                                 if (k >= 0) {
3151                                         safe_close(k);
3152                                         k = -EINVAL;
3153                                 }
3154
3155                                 log_error("Failed to open console: %s", strerror(-k));
3156                                 _exit(EXIT_FAILURE);
3157                         }
3158
3159                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3160                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3161                                 log_error("Failed to duplicate console: %m");
3162                                 _exit(EXIT_FAILURE);
3163                         }
3164
3165                         if (setsid() < 0) {
3166                                 log_error("setsid() failed: %m");
3167                                 _exit(EXIT_FAILURE);
3168                         }
3169
3170                         if (reset_audit_loginuid() < 0)
3171                                 _exit(EXIT_FAILURE);
3172
3173                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3174                                 log_error("PR_SET_PDEATHSIG failed: %m");
3175                                 _exit(EXIT_FAILURE);
3176                         }
3177
3178                         /* Mark everything as slave, so that we still
3179                          * receive mounts from the real root, but don't
3180                          * propagate mounts to the real root. */
3181                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3182                                 log_error("MS_SLAVE|MS_REC failed: %m");
3183                                 _exit(EXIT_FAILURE);
3184                         }
3185
3186                         if (mount_devices(arg_directory,
3187                                           root_device, root_device_rw,
3188                                           home_device, home_device_rw,
3189                                           srv_device, srv_device_rw) < 0)
3190                                 _exit(EXIT_FAILURE);
3191
3192                         /* Turn directory into bind mount */
3193                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3194                                 log_error("Failed to make bind mount: %m");
3195                                 _exit(EXIT_FAILURE);
3196                         }
3197
3198                         r = setup_volatile(arg_directory);
3199                         if (r < 0)
3200                                 _exit(EXIT_FAILURE);
3201
3202                         if (setup_volatile_state(arg_directory) < 0)
3203                                 _exit(EXIT_FAILURE);
3204
3205                         r = base_filesystem_create(arg_directory);
3206                         if (r < 0)
3207                                 _exit(EXIT_FAILURE);
3208
3209                         if (arg_read_only) {
3210                                 k = bind_remount_recursive(arg_directory, true);
3211                                 if (k < 0) {
3212                                         log_error("Failed to make tree read-only: %s", strerror(-k));
3213                                         _exit(EXIT_FAILURE);
3214                                 }
3215                         }
3216
3217                         if (mount_all(arg_directory) < 0)
3218                                 _exit(EXIT_FAILURE);
3219
3220                         if (copy_devnodes(arg_directory) < 0)
3221                                 _exit(EXIT_FAILURE);
3222
3223                         if (setup_ptmx(arg_directory) < 0)
3224                                 _exit(EXIT_FAILURE);
3225
3226                         dev_setup(arg_directory);
3227
3228                         if (setup_seccomp() < 0)
3229                                 _exit(EXIT_FAILURE);
3230
3231                         if (setup_dev_console(arg_directory, console) < 0)
3232                                 _exit(EXIT_FAILURE);
3233
3234                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3235                                 _exit(EXIT_FAILURE);
3236
3237                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3238
3239                         if (setup_boot_id(arg_directory) < 0)
3240                                 _exit(EXIT_FAILURE);
3241
3242                         if (setup_timezone(arg_directory) < 0)
3243                                 _exit(EXIT_FAILURE);
3244
3245                         if (setup_resolv_conf(arg_directory) < 0)
3246                                 _exit(EXIT_FAILURE);
3247
3248                         if (setup_journal(arg_directory) < 0)
3249                                 _exit(EXIT_FAILURE);
3250
3251                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3252                                 _exit(EXIT_FAILURE);
3253
3254                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3255                                 _exit(EXIT_FAILURE);
3256
3257                         if (mount_tmpfs(arg_directory) < 0)
3258                                 _exit(EXIT_FAILURE);
3259
3260                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3261                                 _exit(EXIT_FAILURE);
3262
3263                         /* Tell the parent that we are ready, and that
3264                          * it can cgroupify us to that we lack access
3265                          * to certain devices and resources. */
3266                         barrier_place(&barrier);
3267
3268                         if (chdir(arg_directory) < 0) {
3269                                 log_error("chdir(%s) failed: %m", arg_directory);
3270                                 _exit(EXIT_FAILURE);
3271                         }
3272
3273                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3274                                 log_error("mount(MS_MOVE) failed: %m");
3275                                 _exit(EXIT_FAILURE);
3276                         }
3277
3278                         if (chroot(".") < 0) {
3279                                 log_error("chroot() failed: %m");
3280                                 _exit(EXIT_FAILURE);
3281                         }
3282
3283                         if (chdir("/") < 0) {
3284                                 log_error("chdir() failed: %m");
3285                                 _exit(EXIT_FAILURE);
3286                         }
3287
3288                         umask(0022);
3289
3290                         if (arg_private_network)
3291                                 loopback_setup();
3292
3293                         if (drop_capabilities() < 0) {
3294                                 log_error("drop_capabilities() failed: %m");
3295                                 _exit(EXIT_FAILURE);
3296                         }
3297
3298                         r = change_uid_gid(&home);
3299                         if (r < 0)
3300                                 _exit(EXIT_FAILURE);
3301
3302                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3303                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3304                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3305                                 log_oom();
3306                                 _exit(EXIT_FAILURE);
3307                         }
3308
3309                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3310                                 char as_uuid[37];
3311
3312                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3313                                         log_oom();
3314                                         _exit(EXIT_FAILURE);
3315                                 }
3316                         }
3317
3318                         if (fdset_size(fds) > 0) {
3319                                 k = fdset_cloexec(fds, false);
3320                                 if (k < 0) {
3321                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3322                                         _exit(EXIT_FAILURE);
3323                                 }
3324
3325                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3326                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3327                                         log_oom();
3328                                         _exit(EXIT_FAILURE);
3329                                 }
3330                         }
3331
3332                         setup_hostname();
3333
3334                         if (arg_personality != 0xffffffffLU) {
3335                                 if (personality(arg_personality) < 0) {
3336                                         log_error("personality() failed: %m");
3337                                         _exit(EXIT_FAILURE);
3338                                 }
3339                         } else if (secondary) {
3340                                 if (personality(PER_LINUX32) < 0) {
3341                                         log_error("personality() failed: %m");
3342                                         _exit(EXIT_FAILURE);
3343                                 }
3344                         }
3345
3346 #ifdef HAVE_SELINUX
3347                         if (arg_selinux_context)
3348                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3349                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3350                                         _exit(EXIT_FAILURE);
3351                                 }
3352 #endif
3353
3354                         if (!strv_isempty(arg_setenv)) {
3355                                 char **n;
3356
3357                                 n = strv_env_merge(2, envp, arg_setenv);
3358                                 if (!n) {
3359                                         log_oom();
3360                                         _exit(EXIT_FAILURE);
3361                                 }
3362
3363                                 env_use = n;
3364                         } else
3365                                 env_use = (char**) envp;
3366
3367                         /* Wait until the parent is ready with the setup, too... */
3368                         if (!barrier_place_and_sync(&barrier))
3369                                 _exit(EXIT_FAILURE);
3370
3371                         if (arg_boot) {
3372                                 char **a;
3373                                 size_t l;
3374
3375                                 /* Automatically search for the init system */
3376
3377                                 l = 1 + argc - optind;
3378                                 a = newa(char*, l + 1);
3379                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3380
3381                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3382                                 execve(a[0], a, env_use);
3383
3384                                 a[0] = (char*) "/lib/systemd/systemd";
3385                                 execve(a[0], a, env_use);
3386
3387                                 a[0] = (char*) "/sbin/init";
3388                                 execve(a[0], a, env_use);
3389                         } else if (argc > optind)
3390                                 execvpe(argv[optind], argv + optind, env_use);
3391                         else {
3392                                 chdir(home ? home : "/root");
3393                                 execle("/bin/bash", "-bash", NULL, env_use);
3394                                 execle("/bin/sh", "-sh", NULL, env_use);
3395                         }
3396
3397                         log_error("execv() failed: %m");
3398                         _exit(EXIT_FAILURE);
3399                 }
3400
3401                 barrier_set_role(&barrier, BARRIER_PARENT);
3402                 fdset_free(fds);
3403                 fds = NULL;
3404
3405                 /* wait for child-setup to be done */
3406                 if (barrier_place_and_sync(&barrier)) {
3407                         int ifi = 0;
3408
3409                         r = move_network_interfaces(pid);
3410                         if (r < 0)
3411                                 goto finish;
3412
3413                         r = setup_veth(pid, veth_name, &ifi);
3414                         if (r < 0)
3415                                 goto finish;
3416
3417                         r = setup_bridge(veth_name, &ifi);
3418                         if (r < 0)
3419                                 goto finish;
3420
3421                         r = setup_macvlan(pid);
3422                         if (r < 0)
3423                                 goto finish;
3424
3425                         r = register_machine(pid, ifi);
3426                         if (r < 0)
3427                                 goto finish;
3428
3429                         /* Block SIGCHLD here, before notifying child.
3430                          * process_pty() will handle it with the other signals. */
3431                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3432                         if (r < 0)
3433                                 goto finish;
3434
3435                         /* Reset signal to default */
3436                         r = default_signals(SIGCHLD, -1);
3437                         if (r < 0)
3438                                 goto finish;
3439
3440                         /* Notify the child that the parent is ready with all
3441                          * its setup, and that the child can now hand over
3442                          * control to the code to run inside the container. */
3443                         barrier_place(&barrier);
3444
3445                         k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3446                         if (k < 0) {
3447                                 r = EXIT_FAILURE;
3448                                 break;
3449                         }
3450
3451                         if (!arg_quiet)
3452                                 putc('\n', stdout);
3453
3454                         /* Kill if it is not dead yet anyway */
3455                         terminate_machine(pid);
3456                 }
3457
3458                 /* Normally redundant, but better safe than sorry */
3459                 kill(pid, SIGKILL);
3460
3461                 r = wait_for_container(pid, &container_status);
3462                 pid = 0;
3463
3464                 if (r < 0) {
3465                         /* We failed to wait for the container, or the
3466                          * container exited abnormally */
3467                         r = EXIT_FAILURE;
3468                         break;
3469                 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3470                         /* The container exited with a non-zero
3471                          * status, or with zero status and no reboot
3472                          * was requested. */
3473                         break;
3474
3475                 /* CONTAINER_REBOOTED, loop again */
3476
3477                 if (arg_keep_unit) {
3478                         /* Special handling if we are running as a
3479                          * service: instead of simply restarting the
3480                          * machine we want to restart the entire
3481                          * service, so let's inform systemd about this
3482                          * with the special exit code 133. The service
3483                          * file uses RestartForceExitStatus=133 so
3484                          * that this results in a full nspawn
3485                          * restart. This is necessary since we might
3486                          * have cgroup parameters set we want to have
3487                          * flushed out. */
3488                         r = 133;
3489                         break;
3490                 }
3491         }
3492
3493 finish:
3494         loop_remove(loop_nr, &image_fd);
3495
3496         if (pid > 0)
3497                 kill(pid, SIGKILL);
3498
3499         free(arg_directory);
3500         free(arg_machine);
3501         free(arg_user);
3502         strv_free(arg_setenv);
3503         strv_free(arg_network_interfaces);
3504         strv_free(arg_network_macvlan);
3505         strv_free(arg_bind);
3506         strv_free(arg_bind_ro);
3507         strv_free(arg_tmpfs);
3508
3509         return r;
3510 }