chiark / gitweb /
util: make use of newly added reset_signal_mask() call wherever appropriate
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92
93 #ifdef HAVE_SECCOMP
94 #include "seccomp-util.h"
95 #endif
96
/* Outcome reported for a container run. NOTE(review): the code that
 * produces and consumes these values is outside this section; the
 * meanings below are taken from the enumerator names only. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED,   /* presumably: the container exited for good */
        CONTAINER_REBOOTED      /* presumably: the container asked for a reboot */
} ContainerStatus;
101
/* Journal linking mode, selected with --link-journal=MODE (or -j). */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto; also the built-in default */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, and what -j selects */
} LinkJournal;
108
/* Volatile mode, selected with --volatile[=MODE]. */
typedef enum Volatile {
        VOLATILE_NO,    /* default, or an explicit boolean "false" argument */
        VOLATILE_YES,   /* --volatile without argument, or a boolean "true" */
        VOLATILE_STATE  /* --volatile=state */
} Volatile;
114
115 static char *arg_directory = NULL;
116 static char *arg_user = NULL;
117 static sd_id128_t arg_uuid = {};
118 static char *arg_machine = NULL;
119 static const char *arg_selinux_context = NULL;
120 static const char *arg_selinux_apifs_context = NULL;
121 static const char *arg_slice = NULL;
122 static bool arg_private_network = false;
123 static bool arg_read_only = false;
124 static bool arg_boot = false;
125 static LinkJournal arg_link_journal = LINK_AUTO;
126 static uint64_t arg_retain =
127         (1ULL << CAP_CHOWN) |
128         (1ULL << CAP_DAC_OVERRIDE) |
129         (1ULL << CAP_DAC_READ_SEARCH) |
130         (1ULL << CAP_FOWNER) |
131         (1ULL << CAP_FSETID) |
132         (1ULL << CAP_IPC_OWNER) |
133         (1ULL << CAP_KILL) |
134         (1ULL << CAP_LEASE) |
135         (1ULL << CAP_LINUX_IMMUTABLE) |
136         (1ULL << CAP_NET_BIND_SERVICE) |
137         (1ULL << CAP_NET_BROADCAST) |
138         (1ULL << CAP_NET_RAW) |
139         (1ULL << CAP_SETGID) |
140         (1ULL << CAP_SETFCAP) |
141         (1ULL << CAP_SETPCAP) |
142         (1ULL << CAP_SETUID) |
143         (1ULL << CAP_SYS_ADMIN) |
144         (1ULL << CAP_SYS_CHROOT) |
145         (1ULL << CAP_SYS_NICE) |
146         (1ULL << CAP_SYS_PTRACE) |
147         (1ULL << CAP_SYS_TTY_CONFIG) |
148         (1ULL << CAP_SYS_RESOURCE) |
149         (1ULL << CAP_SYS_BOOT) |
150         (1ULL << CAP_AUDIT_WRITE) |
151         (1ULL << CAP_AUDIT_CONTROL) |
152         (1ULL << CAP_MKNOD);
153 static char **arg_bind = NULL;
154 static char **arg_bind_ro = NULL;
155 static char **arg_tmpfs = NULL;
156 static char **arg_setenv = NULL;
157 static bool arg_quiet = false;
158 static bool arg_share_system = false;
159 static bool arg_register = true;
160 static bool arg_keep_unit = false;
161 static char **arg_network_interfaces = NULL;
162 static char **arg_network_macvlan = NULL;
163 static bool arg_network_veth = false;
164 static const char *arg_network_bridge = NULL;
165 static unsigned long arg_personality = 0xffffffffLU;
166 static const char *arg_image = NULL;
167 static Volatile arg_volatile = VOLATILE_NO;
168
169 static void help(void) {
170         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
171                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
172                "  -h --help                 Show this help\n"
173                "     --version              Print version string\n"
174                "  -q --quiet                Do not show status information\n"
175                "  -D --directory=PATH       Root directory for the container\n"
176                "  -i --image=PATH           File system device or image for the container\n"
177                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
178                "  -u --user=USER            Run the command under specified user or uid\n"
179                "  -M --machine=NAME         Set the machine name for the container\n"
180                "     --uuid=UUID            Set a specific machine UUID for the container\n"
181                "  -S --slice=SLICE          Place the container in the specified slice\n"
182                "     --private-network      Disable network in container\n"
183                "     --network-interface=INTERFACE\n"
184                "                            Assign an existing network interface to the\n"
185                "                            container\n"
186                "     --network-macvlan=INTERFACE\n"
187                "                            Create a macvlan network interface based on an\n"
188                "                            existing network interface to the container\n"
189                "     --network-veth         Add a virtual ethernet connection between host\n"
190                "                            and container\n"
191                "     --network-bridge=INTERFACE\n"
192                "                            Add a virtual ethernet connection between host\n"
193                "                            and container and add it to an existing bridge on\n"
194                "                            the host\n"
195                "  -Z --selinux-context=SECLABEL\n"
196                "                            Set the SELinux security context to be used by\n"
197                "                            processes in the container\n"
198                "  -L --selinux-apifs-context=SECLABEL\n"
199                "                            Set the SELinux security context to be used by\n"
200                "                            API/tmpfs file systems in the container\n"
201                "     --capability=CAP       In addition to the default, retain specified\n"
202                "                            capability\n"
203                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
204                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
205                "  -j                        Equivalent to --link-journal=host\n"
206                "     --read-only            Mount the root directory read-only\n"
207                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
208                "                            the container\n"
209                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
210                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
211                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
212                "     --share-system         Share system namespaces with host\n"
213                "     --register=BOOLEAN     Register container as machine\n"
214                "     --keep-unit            Do not register a scope for the machine, reuse\n"
215                "                            the service unit nspawn is running in\n"
216                "     --volatile[=MODE]      Run the system in volatile mode\n",
217                program_invocation_short_name);
218 }
219
220 static int parse_argv(int argc, char *argv[]) {
221
222         enum {
223                 ARG_VERSION = 0x100,
224                 ARG_PRIVATE_NETWORK,
225                 ARG_UUID,
226                 ARG_READ_ONLY,
227                 ARG_CAPABILITY,
228                 ARG_DROP_CAPABILITY,
229                 ARG_LINK_JOURNAL,
230                 ARG_BIND,
231                 ARG_BIND_RO,
232                 ARG_TMPFS,
233                 ARG_SETENV,
234                 ARG_SHARE_SYSTEM,
235                 ARG_REGISTER,
236                 ARG_KEEP_UNIT,
237                 ARG_NETWORK_INTERFACE,
238                 ARG_NETWORK_MACVLAN,
239                 ARG_NETWORK_VETH,
240                 ARG_NETWORK_BRIDGE,
241                 ARG_PERSONALITY,
242                 ARG_VOLATILE,
243         };
244
245         static const struct option options[] = {
246                 { "help",                  no_argument,       NULL, 'h'                   },
247                 { "version",               no_argument,       NULL, ARG_VERSION           },
248                 { "directory",             required_argument, NULL, 'D'                   },
249                 { "user",                  required_argument, NULL, 'u'                   },
250                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
251                 { "boot",                  no_argument,       NULL, 'b'                   },
252                 { "uuid",                  required_argument, NULL, ARG_UUID              },
253                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
254                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
255                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
256                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
257                 { "bind",                  required_argument, NULL, ARG_BIND              },
258                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
259                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
260                 { "machine",               required_argument, NULL, 'M'                   },
261                 { "slice",                 required_argument, NULL, 'S'                   },
262                 { "setenv",                required_argument, NULL, ARG_SETENV            },
263                 { "selinux-context",       required_argument, NULL, 'Z'                   },
264                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
265                 { "quiet",                 no_argument,       NULL, 'q'                   },
266                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
267                 { "register",              required_argument, NULL, ARG_REGISTER          },
268                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
269                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
270                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
271                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
272                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
273                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
274                 { "image",                 required_argument, NULL, 'i'                   },
275                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
276                 {}
277         };
278
279         int c, r;
280         uint64_t plus = 0, minus = 0;
281
282         assert(argc >= 0);
283         assert(argv);
284
285         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
286
287                 switch (c) {
288
289                 case 'h':
290                         help();
291                         return 0;
292
293                 case ARG_VERSION:
294                         puts(PACKAGE_STRING);
295                         puts(SYSTEMD_FEATURES);
296                         return 0;
297
298                 case 'D':
299                         free(arg_directory);
300                         arg_directory = canonicalize_file_name(optarg);
301                         if (!arg_directory) {
302                                 log_error("Invalid root directory: %m");
303                                 return -ENOMEM;
304                         }
305
306                         break;
307
308                 case 'i':
309                         arg_image = optarg;
310                         break;
311
312                 case 'u':
313                         free(arg_user);
314                         arg_user = strdup(optarg);
315                         if (!arg_user)
316                                 return log_oom();
317
318                         break;
319
320                 case ARG_NETWORK_BRIDGE:
321                         arg_network_bridge = optarg;
322
323                         /* fall through */
324
325                 case ARG_NETWORK_VETH:
326                         arg_network_veth = true;
327                         arg_private_network = true;
328                         break;
329
330                 case ARG_NETWORK_INTERFACE:
331                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
332                                 return log_oom();
333
334                         arg_private_network = true;
335                         break;
336
337                 case ARG_NETWORK_MACVLAN:
338                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
339                                 return log_oom();
340
341                         /* fall through */
342
343                 case ARG_PRIVATE_NETWORK:
344                         arg_private_network = true;
345                         break;
346
347                 case 'b':
348                         arg_boot = true;
349                         break;
350
351                 case ARG_UUID:
352                         r = sd_id128_from_string(optarg, &arg_uuid);
353                         if (r < 0) {
354                                 log_error("Invalid UUID: %s", optarg);
355                                 return r;
356                         }
357                         break;
358
359                 case 'S':
360                         arg_slice = optarg;
361                         break;
362
363                 case 'M':
364                         if (isempty(optarg)) {
365                                 free(arg_machine);
366                                 arg_machine = NULL;
367                         } else {
368
369                                 if (!hostname_is_valid(optarg)) {
370                                         log_error("Invalid machine name: %s", optarg);
371                                         return -EINVAL;
372                                 }
373
374                                 free(arg_machine);
375                                 arg_machine = strdup(optarg);
376                                 if (!arg_machine)
377                                         return log_oom();
378
379                                 break;
380                         }
381
382                 case 'Z':
383                         arg_selinux_context = optarg;
384                         break;
385
386                 case 'L':
387                         arg_selinux_apifs_context = optarg;
388                         break;
389
390                 case ARG_READ_ONLY:
391                         arg_read_only = true;
392                         break;
393
394                 case ARG_CAPABILITY:
395                 case ARG_DROP_CAPABILITY: {
396                         const char *state, *word;
397                         size_t length;
398
399                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
400                                 _cleanup_free_ char *t;
401                                 cap_value_t cap;
402
403                                 t = strndup(word, length);
404                                 if (!t)
405                                         return log_oom();
406
407                                 if (streq(t, "all")) {
408                                         if (c == ARG_CAPABILITY)
409                                                 plus = (uint64_t) -1;
410                                         else
411                                                 minus = (uint64_t) -1;
412                                 } else {
413                                         if (cap_from_name(t, &cap) < 0) {
414                                                 log_error("Failed to parse capability %s.", t);
415                                                 return -EINVAL;
416                                         }
417
418                                         if (c == ARG_CAPABILITY)
419                                                 plus |= 1ULL << (uint64_t) cap;
420                                         else
421                                                 minus |= 1ULL << (uint64_t) cap;
422                                 }
423                         }
424
425                         break;
426                 }
427
428                 case 'j':
429                         arg_link_journal = LINK_GUEST;
430                         break;
431
432                 case ARG_LINK_JOURNAL:
433                         if (streq(optarg, "auto"))
434                                 arg_link_journal = LINK_AUTO;
435                         else if (streq(optarg, "no"))
436                                 arg_link_journal = LINK_NO;
437                         else if (streq(optarg, "guest"))
438                                 arg_link_journal = LINK_GUEST;
439                         else if (streq(optarg, "host"))
440                                 arg_link_journal = LINK_HOST;
441                         else {
442                                 log_error("Failed to parse link journal mode %s", optarg);
443                                 return -EINVAL;
444                         }
445
446                         break;
447
448                 case ARG_BIND:
449                 case ARG_BIND_RO: {
450                         _cleanup_free_ char *a = NULL, *b = NULL;
451                         char *e;
452                         char ***x;
453
454                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
455
456                         e = strchr(optarg, ':');
457                         if (e) {
458                                 a = strndup(optarg, e - optarg);
459                                 b = strdup(e + 1);
460                         } else {
461                                 a = strdup(optarg);
462                                 b = strdup(optarg);
463                         }
464
465                         if (!a || !b)
466                                 return log_oom();
467
468                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
469                                 log_error("Invalid bind mount specification: %s", optarg);
470                                 return -EINVAL;
471                         }
472
473                         r = strv_extend(x, a);
474                         if (r < 0)
475                                 return log_oom();
476
477                         r = strv_extend(x, b);
478                         if (r < 0)
479                                 return log_oom();
480
481                         break;
482                 }
483
484                 case ARG_TMPFS: {
485                         _cleanup_free_ char *a = NULL, *b = NULL;
486                         char *e;
487
488                         e = strchr(optarg, ':');
489                         if (e) {
490                                 a = strndup(optarg, e - optarg);
491                                 b = strdup(e + 1);
492                         } else {
493                                 a = strdup(optarg);
494                                 b = strdup("mode=0755");
495                         }
496
497                         if (!a || !b)
498                                 return log_oom();
499
500                         if (!path_is_absolute(a)) {
501                                 log_error("Invalid tmpfs specification: %s", optarg);
502                                 return -EINVAL;
503                         }
504
505                         r = strv_push(&arg_tmpfs, a);
506                         if (r < 0)
507                                 return log_oom();
508
509                         a = NULL;
510
511                         r = strv_push(&arg_tmpfs, b);
512                         if (r < 0)
513                                 return log_oom();
514
515                         b = NULL;
516
517                         break;
518                 }
519
520                 case ARG_SETENV: {
521                         char **n;
522
523                         if (!env_assignment_is_valid(optarg)) {
524                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
525                                 return -EINVAL;
526                         }
527
528                         n = strv_env_set(arg_setenv, optarg);
529                         if (!n)
530                                 return log_oom();
531
532                         strv_free(arg_setenv);
533                         arg_setenv = n;
534                         break;
535                 }
536
537                 case 'q':
538                         arg_quiet = true;
539                         break;
540
541                 case ARG_SHARE_SYSTEM:
542                         arg_share_system = true;
543                         break;
544
545                 case ARG_REGISTER:
546                         r = parse_boolean(optarg);
547                         if (r < 0) {
548                                 log_error("Failed to parse --register= argument: %s", optarg);
549                                 return r;
550                         }
551
552                         arg_register = r;
553                         break;
554
555                 case ARG_KEEP_UNIT:
556                         arg_keep_unit = true;
557                         break;
558
559                 case ARG_PERSONALITY:
560
561                         arg_personality = personality_from_string(optarg);
562                         if (arg_personality == 0xffffffffLU) {
563                                 log_error("Unknown or unsupported personality '%s'.", optarg);
564                                 return -EINVAL;
565                         }
566
567                         break;
568
569                 case ARG_VOLATILE:
570
571                         if (!optarg)
572                                 arg_volatile = VOLATILE_YES;
573                         else {
574                                 r = parse_boolean(optarg);
575                                 if (r < 0) {
576                                         if (streq(optarg, "state"))
577                                                 arg_volatile = VOLATILE_STATE;
578                                         else {
579                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
580                                                 return r;
581                                         }
582                                 } else
583                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
584                         }
585
586                         break;
587
588                 case '?':
589                         return -EINVAL;
590
591                 default:
592                         assert_not_reached("Unhandled option");
593                 }
594
595         if (arg_share_system)
596                 arg_register = false;
597
598         if (arg_boot && arg_share_system) {
599                 log_error("--boot and --share-system may not be combined.");
600                 return -EINVAL;
601         }
602
603         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
604                 log_error("--keep-unit may not be used when invoked from a user session.");
605                 return -EINVAL;
606         }
607
608         if (arg_directory && arg_image) {
609                 log_error("--directory= and --image= may not be combined.");
610                 return -EINVAL;
611         }
612
613         if (arg_volatile != VOLATILE_NO && arg_read_only) {
614                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
615                 return -EINVAL;
616         }
617
618         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
619
620         return 1;
621 }
622
623 static int mount_all(const char *dest) {
624
625         typedef struct MountPoint {
626                 const char *what;
627                 const char *where;
628                 const char *type;
629                 const char *options;
630                 unsigned long flags;
631                 bool fatal;
632         } MountPoint;
633
634         static const MountPoint mount_table[] = {
635                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
636                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
637                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
638                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
639                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
640                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
641                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
642                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
643 #ifdef HAVE_SELINUX
644                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
645                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
646 #endif
647         };
648
649         unsigned k;
650         int r = 0;
651
652         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
653                 _cleanup_free_ char *where = NULL;
654 #ifdef HAVE_SELINUX
655                 _cleanup_free_ char *options = NULL;
656 #endif
657                 const char *o;
658                 int t;
659
660                 where = strjoin(dest, "/", mount_table[k].where, NULL);
661                 if (!where)
662                         return log_oom();
663
664                 t = path_is_mount_point(where, true);
665                 if (t < 0) {
666                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
667
668                         if (r == 0)
669                                 r = t;
670
671                         continue;
672                 }
673
674                 /* Skip this entry if it is not a remount. */
675                 if (mount_table[k].what && t > 0)
676                         continue;
677
678                 mkdir_p(where, 0755);
679
680 #ifdef HAVE_SELINUX
681                 if (arg_selinux_apifs_context &&
682                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
683                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
684                         if (!options)
685                                 return log_oom();
686
687                         o = options;
688                 } else
689 #endif
690                         o = mount_table[k].options;
691
692
693                 if (mount(mount_table[k].what,
694                           where,
695                           mount_table[k].type,
696                           mount_table[k].flags,
697                           o) < 0 &&
698                     mount_table[k].fatal) {
699
700                         log_error("mount(%s) failed: %m", where);
701
702                         if (r == 0)
703                                 r = -errno;
704                 }
705         }
706
707         return r;
708 }
709
710 static int mount_binds(const char *dest, char **l, bool ro) {
711         char **x, **y;
712
713         STRV_FOREACH_PAIR(x, y, l) {
714                 _cleanup_free_ char *where = NULL;
715                 struct stat source_st, dest_st;
716                 int r;
717
718                 if (stat(*x, &source_st) < 0) {
719                         log_error("Failed to stat %s: %m", *x);
720                         return -errno;
721                 }
722
723                 where = strappend(dest, *y);
724                 if (!where)
725                         return log_oom();
726
727                 r = stat(where, &dest_st);
728                 if (r == 0) {
729                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
730                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
731                                 return -EINVAL;
732                         }
733                 } else if (errno == ENOENT) {
734                         r = mkdir_parents_label(where, 0755);
735                         if (r < 0) {
736                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
737                                 return r;
738                         }
739                 } else {
740                         log_error("Failed to bind mount %s: %m", *x);
741                         return -errno;
742                 }
743
744                 /* Create the mount point, but be conservative -- refuse to create block
745                  * and char devices. */
746                 if (S_ISDIR(source_st.st_mode))
747                         mkdir_label(where, 0755);
748                 else if (S_ISFIFO(source_st.st_mode))
749                         mkfifo(where, 0644);
750                 else if (S_ISSOCK(source_st.st_mode))
751                         mknod(where, 0644 | S_IFSOCK, 0);
752                 else if (S_ISREG(source_st.st_mode))
753                         touch(where);
754                 else {
755                         log_error("Refusing to create mountpoint for file: %s", *x);
756                         return -ENOTSUP;
757                 }
758
759                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
760                         log_error("mount(%s) failed: %m", where);
761                         return -errno;
762                 }
763
764                 if (ro) {
765                         r = bind_remount_recursive(where, true);
766                         if (r < 0) {
767                                 log_error("Read-Only bind mount failed: %s", strerror(-r));
768                                 return r;
769                         }
770                 }
771         }
772
773         return 0;
774 }
775
776 static int mount_tmpfs(const char *dest) {
777         char **i, **o;
778
779         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
780                 _cleanup_free_ char *where = NULL;
781
782                 where = strappend(dest, *i);
783                 if (!where)
784                         return log_oom();
785
786                 mkdir_label(where, 0755);
787
788                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
789                         log_error("tmpfs mount to %s failed: %m", where);
790                         return -errno;
791                 }
792         }
793
794         return 0;
795 }
796
797 static int setup_timezone(const char *dest) {
798         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
799         char *z, *y;
800         int r;
801
802         assert(dest);
803
804         /* Fix the timezone, if possible */
805         r = readlink_malloc("/etc/localtime", &p);
806         if (r < 0) {
807                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
808                 return 0;
809         }
810
811         z = path_startswith(p, "../usr/share/zoneinfo/");
812         if (!z)
813                 z = path_startswith(p, "/usr/share/zoneinfo/");
814         if (!z) {
815                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
816                 return 0;
817         }
818
819         where = strappend(dest, "/etc/localtime");
820         if (!where)
821                 return log_oom();
822
823         r = readlink_malloc(where, &q);
824         if (r >= 0) {
825                 y = path_startswith(q, "../usr/share/zoneinfo/");
826                 if (!y)
827                         y = path_startswith(q, "/usr/share/zoneinfo/");
828
829                 /* Already pointing to the right place? Then do nothing .. */
830                 if (y && streq(y, z))
831                         return 0;
832         }
833
834         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
835         if (!check)
836                 return log_oom();
837
838         if (access(check, F_OK) < 0) {
839                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
840                 return 0;
841         }
842
843         what = strappend("../usr/share/zoneinfo/", z);
844         if (!what)
845                 return log_oom();
846
847         mkdir_parents(where, 0755);
848         unlink(where);
849
850         if (symlink(what, where) < 0) {
851                 log_error("Failed to correct timezone of container: %m");
852                 return 0;
853         }
854
855         return 0;
856 }
857
858 static int setup_resolv_conf(const char *dest) {
859         _cleanup_free_ char *where = NULL;
860
861         assert(dest);
862
863         if (arg_private_network)
864                 return 0;
865
866         /* Fix resolv.conf, if possible */
867         where = strappend(dest, "/etc/resolv.conf");
868         if (!where)
869                 return log_oom();
870
871         /* We don't really care for the results of this really. If it
872          * fails, it fails, but meh... */
873         mkdir_parents(where, 0755);
874         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
875
876         return 0;
877 }
878
879 static int setup_volatile_state(const char *directory) {
880         const char *p;
881         int r;
882
883         assert(directory);
884
885         if (arg_volatile != VOLATILE_STATE)
886                 return 0;
887
888         /* --volatile=state means we simply overmount /var
889            with a tmpfs, and the rest read-only. */
890
891         r = bind_remount_recursive(directory, true);
892         if (r < 0) {
893                 log_error("Failed to remount %s read-only: %s", directory, strerror(-r));
894                 return r;
895         }
896
897         p = strappenda(directory, "/var");
898         mkdir(p, 0755);
899
900         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
901                 log_error("Failed to mount tmpfs to /var: %m");
902                 return -errno;
903         }
904
905         return 0;
906 }
907
908 static int setup_volatile(const char *directory) {
909         bool tmpfs_mounted = false, bind_mounted = false;
910         char template[] = "/tmp/nspawn-volatile-XXXXXX";
911         const char *f, *t;
912         int r;
913
914         assert(directory);
915
916         if (arg_volatile != VOLATILE_YES)
917                 return 0;
918
919         /* --volatile=yes means we mount a tmpfs to the root dir, and
920            the original /usr to use inside it, and that read-only. */
921
922         if (!mkdtemp(template)) {
923                 log_error("Failed to create temporary directory: %m");
924                 return -errno;
925         }
926
927         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
928                 log_error("Failed to mount tmpfs for root directory: %m");
929                 r = -errno;
930                 goto fail;
931         }
932
933         tmpfs_mounted = true;
934
935         f = strappenda(directory, "/usr");
936         t = strappenda(template, "/usr");
937
938         mkdir(t, 0755);
939         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
940                 log_error("Failed to create /usr bind mount: %m");
941                 r = -errno;
942                 goto fail;
943         }
944
945         bind_mounted = true;
946
947         r = bind_remount_recursive(t, true);
948         if (r < 0) {
949                 log_error("Failed to remount %s read-only: %s", t, strerror(-r));
950                 goto fail;
951         }
952
953         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
954                 log_error("Failed to move root mount: %m");
955                 r = -errno;
956                 goto fail;
957         }
958
959         rmdir(template);
960
961         return 0;
962
963 fail:
964         if (bind_mounted)
965                 umount(t);
966         if (tmpfs_mounted)
967                 umount(template);
968         rmdir(template);
969         return r;
970 }
971
972 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
973
974         snprintf(s, 37,
975                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
976                  SD_ID128_FORMAT_VAL(id));
977
978         return s;
979 }
980
981 static int setup_boot_id(const char *dest) {
982         _cleanup_free_ char *from = NULL, *to = NULL;
983         sd_id128_t rnd = {};
984         char as_uuid[37];
985         int r;
986
987         assert(dest);
988
989         if (arg_share_system)
990                 return 0;
991
992         /* Generate a new randomized boot ID, so that each boot-up of
993          * the container gets a new one */
994
995         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
996         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
997         if (!from || !to)
998                 return log_oom();
999
1000         r = sd_id128_randomize(&rnd);
1001         if (r < 0) {
1002                 log_error("Failed to generate random boot id: %s", strerror(-r));
1003                 return r;
1004         }
1005
1006         id128_format_as_uuid(rnd, as_uuid);
1007
1008         r = write_string_file(from, as_uuid);
1009         if (r < 0) {
1010                 log_error("Failed to write boot id: %s", strerror(-r));
1011                 return r;
1012         }
1013
1014         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1015                 log_error("Failed to bind mount boot id: %m");
1016                 r = -errno;
1017         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1018                 log_warning("Failed to make boot id read-only: %m");
1019
1020         unlink(from);
1021         return r;
1022 }
1023
1024 static int copy_devnodes(const char *dest) {
1025
1026         static const char devnodes[] =
1027                 "null\0"
1028                 "zero\0"
1029                 "full\0"
1030                 "random\0"
1031                 "urandom\0"
1032                 "tty\0";
1033
1034         const char *d;
1035         int r = 0;
1036         _cleanup_umask_ mode_t u;
1037
1038         assert(dest);
1039
1040         u = umask(0000);
1041
1042         NULSTR_FOREACH(d, devnodes) {
1043                 _cleanup_free_ char *from = NULL, *to = NULL;
1044                 struct stat st;
1045
1046                 from = strappend("/dev/", d);
1047                 to = strjoin(dest, "/dev/", d, NULL);
1048                 if (!from || !to)
1049                         return log_oom();
1050
1051                 if (stat(from, &st) < 0) {
1052
1053                         if (errno != ENOENT) {
1054                                 log_error("Failed to stat %s: %m", from);
1055                                 return -errno;
1056                         }
1057
1058                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1059
1060                         log_error("%s is not a char or block device, cannot copy", from);
1061                         return -EIO;
1062
1063                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1064
1065                         log_error("mknod(%s) failed: %m", dest);
1066                         return  -errno;
1067                 }
1068         }
1069
1070         return r;
1071 }
1072
1073 static int setup_ptmx(const char *dest) {
1074         _cleanup_free_ char *p = NULL;
1075
1076         p = strappend(dest, "/dev/ptmx");
1077         if (!p)
1078                 return log_oom();
1079
1080         if (symlink("pts/ptmx", p) < 0) {
1081                 log_error("Failed to create /dev/ptmx symlink: %m");
1082                 return -errno;
1083         }
1084
1085         return 0;
1086 }
1087
1088 static int setup_dev_console(const char *dest, const char *console) {
1089         _cleanup_umask_ mode_t u;
1090         const char *to;
1091         struct stat st;
1092         int r;
1093
1094         assert(dest);
1095         assert(console);
1096
1097         u = umask(0000);
1098
1099         if (stat("/dev/null", &st) < 0) {
1100                 log_error("Failed to stat /dev/null: %m");
1101                 return -errno;
1102         }
1103
1104         r = chmod_and_chown(console, 0600, 0, 0);
1105         if (r < 0) {
1106                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
1107                 return r;
1108         }
1109
1110         /* We need to bind mount the right tty to /dev/console since
1111          * ptys can only exist on pts file systems. To have something
1112          * to bind mount things on we create a device node first, and
1113          * use /dev/null for that since we the cgroups device policy
1114          * allows us to create that freely, while we cannot create
1115          * /dev/console. (Note that the major minor doesn't actually
1116          * matter here, since we mount it over anyway). */
1117
1118         to = strappenda(dest, "/dev/console");
1119         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1120                 log_error("mknod() for /dev/console failed: %m");
1121                 return -errno;
1122         }
1123
1124         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1125                 log_error("Bind mount for /dev/console failed: %m");
1126                 return -errno;
1127         }
1128
1129         return 0;
1130 }
1131
1132 static int setup_kmsg(const char *dest, int kmsg_socket) {
1133         _cleanup_free_ char *from = NULL, *to = NULL;
1134         int r, fd, k;
1135         _cleanup_umask_ mode_t u;
1136         union {
1137                 struct cmsghdr cmsghdr;
1138                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1139         } control = {};
1140         struct msghdr mh = {
1141                 .msg_control = &control,
1142                 .msg_controllen = sizeof(control),
1143         };
1144         struct cmsghdr *cmsg;
1145
1146         assert(dest);
1147         assert(kmsg_socket >= 0);
1148
1149         u = umask(0000);
1150
1151         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1152          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1153          * on the reading side behave very similar to /proc/kmsg,
1154          * their writing side behaves differently from /dev/kmsg in
1155          * that writing blocks when nothing is reading. In order to
1156          * avoid any problems with containers deadlocking due to this
1157          * we simply make /dev/kmsg unavailable to the container. */
1158         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1159             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1160                 return log_oom();
1161
1162         if (mkfifo(from, 0600) < 0) {
1163                 log_error("mkfifo() for /dev/kmsg failed: %m");
1164                 return -errno;
1165         }
1166
1167         r = chmod_and_chown(from, 0600, 0, 0);
1168         if (r < 0) {
1169                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
1170                 return r;
1171         }
1172
1173         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1174                 log_error("Bind mount for /proc/kmsg failed: %m");
1175                 return -errno;
1176         }
1177
1178         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1179         if (fd < 0) {
1180                 log_error("Failed to open fifo: %m");
1181                 return -errno;
1182         }
1183
1184         cmsg = CMSG_FIRSTHDR(&mh);
1185         cmsg->cmsg_level = SOL_SOCKET;
1186         cmsg->cmsg_type = SCM_RIGHTS;
1187         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1188         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1189
1190         mh.msg_controllen = cmsg->cmsg_len;
1191
1192         /* Store away the fd in the socket, so that it stays open as
1193          * long as we run the child */
1194         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1195         safe_close(fd);
1196
1197         if (k < 0) {
1198                 log_error("Failed to send FIFO fd: %m");
1199                 return -errno;
1200         }
1201
1202         /* And now make the FIFO unavailable as /dev/kmsg... */
1203         unlink(from);
1204         return 0;
1205 }
1206
1207 static int setup_hostname(void) {
1208
1209         if (arg_share_system)
1210                 return 0;
1211
1212         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1213                 return -errno;
1214
1215         return 0;
1216 }
1217
1218 static int setup_journal(const char *directory) {
1219         sd_id128_t machine_id, this_id;
1220         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1221         char *id;
1222         int r;
1223
1224         p = strappend(directory, "/etc/machine-id");
1225         if (!p)
1226                 return log_oom();
1227
1228         r = read_one_line_file(p, &b);
1229         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1230                 return 0;
1231         else if (r < 0) {
1232                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1233                 return r;
1234         }
1235
1236         id = strstrip(b);
1237         if (isempty(id) && arg_link_journal == LINK_AUTO)
1238                 return 0;
1239
1240         /* Verify validity */
1241         r = sd_id128_from_string(id, &machine_id);
1242         if (r < 0) {
1243                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1244                 return r;
1245         }
1246
1247         r = sd_id128_get_machine(&this_id);
1248         if (r < 0) {
1249                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1250                 return r;
1251         }
1252
1253         if (sd_id128_equal(machine_id, this_id)) {
1254                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1255                          "Host and machine ids are equal (%s): refusing to link journals", id);
1256                 if (arg_link_journal == LINK_AUTO)
1257                         return 0;
1258                 return
1259                         -EEXIST;
1260         }
1261
1262         if (arg_link_journal == LINK_NO)
1263                 return 0;
1264
1265         free(p);
1266         p = strappend("/var/log/journal/", id);
1267         q = strjoin(directory, "/var/log/journal/", id, NULL);
1268         if (!p || !q)
1269                 return log_oom();
1270
1271         if (path_is_mount_point(p, false) > 0) {
1272                 if (arg_link_journal != LINK_AUTO) {
1273                         log_error("%s: already a mount point, refusing to use for journal", p);
1274                         return -EEXIST;
1275                 }
1276
1277                 return 0;
1278         }
1279
1280         if (path_is_mount_point(q, false) > 0) {
1281                 if (arg_link_journal != LINK_AUTO) {
1282                         log_error("%s: already a mount point, refusing to use for journal", q);
1283                         return -EEXIST;
1284                 }
1285
1286                 return 0;
1287         }
1288
1289         r = readlink_and_make_absolute(p, &d);
1290         if (r >= 0) {
1291                 if ((arg_link_journal == LINK_GUEST ||
1292                      arg_link_journal == LINK_AUTO) &&
1293                     path_equal(d, q)) {
1294
1295                         r = mkdir_p(q, 0755);
1296                         if (r < 0)
1297                                 log_warning("failed to create directory %s: %m", q);
1298                         return 0;
1299                 }
1300
1301                 if (unlink(p) < 0) {
1302                         log_error("Failed to remove symlink %s: %m", p);
1303                         return -errno;
1304                 }
1305         } else if (r == -EINVAL) {
1306
1307                 if (arg_link_journal == LINK_GUEST &&
1308                     rmdir(p) < 0) {
1309
1310                         if (errno == ENOTDIR) {
1311                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1312                                 return r;
1313                         } else {
1314                                 log_error("Failed to remove %s: %m", p);
1315                                 return -errno;
1316                         }
1317                 }
1318         } else if (r != -ENOENT) {
1319                 log_error("readlink(%s) failed: %m", p);
1320                 return r;
1321         }
1322
1323         if (arg_link_journal == LINK_GUEST) {
1324
1325                 if (symlink(q, p) < 0) {
1326                         log_error("Failed to symlink %s to %s: %m", q, p);
1327                         return -errno;
1328                 }
1329
1330                 r = mkdir_p(q, 0755);
1331                 if (r < 0)
1332                         log_warning("failed to create directory %s: %m", q);
1333                 return 0;
1334         }
1335
1336         if (arg_link_journal == LINK_HOST) {
1337                 r = mkdir_p(p, 0755);
1338                 if (r < 0) {
1339                         log_error("Failed to create %s: %m", p);
1340                         return r;
1341                 }
1342
1343         } else if (access(p, F_OK) < 0)
1344                 return 0;
1345
1346         if (dir_is_empty(q) == 0)
1347                 log_warning("%s is not empty, proceeding anyway.", q);
1348
1349         r = mkdir_p(q, 0755);
1350         if (r < 0) {
1351                 log_error("Failed to create %s: %m", q);
1352                 return r;
1353         }
1354
1355         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1356                 log_error("Failed to bind mount journal from host into guest: %m");
1357                 return -errno;
1358         }
1359
1360         return 0;
1361 }
1362
1363 static int setup_kdbus(const char *dest, const char *path) {
1364         const char *p;
1365
1366         if (!path)
1367                 return 0;
1368
1369         p = strappenda(dest, "/dev/kdbus");
1370         if (mkdir(p, 0755) < 0) {
1371                 log_error("Failed to create kdbus path: %m");
1372                 return  -errno;
1373         }
1374
1375         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1376                 log_error("Failed to mount kdbus domain path: %m");
1377                 return -errno;
1378         }
1379
1380         return 0;
1381 }
1382
1383 static int drop_capabilities(void) {
1384         return capability_bounding_set_drop(~arg_retain, false);
1385 }
1386
1387 static int register_machine(pid_t pid, int local_ifindex) {
1388         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1389         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1390         int r;
1391
1392         if (!arg_register)
1393                 return 0;
1394
1395         r = sd_bus_default_system(&bus);
1396         if (r < 0) {
1397                 log_error("Failed to open system bus: %s", strerror(-r));
1398                 return r;
1399         }
1400
1401         if (arg_keep_unit) {
1402                 r = sd_bus_call_method(
1403                                 bus,
1404                                 "org.freedesktop.machine1",
1405                                 "/org/freedesktop/machine1",
1406                                 "org.freedesktop.machine1.Manager",
1407                                 "RegisterMachineWithNetwork",
1408                                 &error,
1409                                 NULL,
1410                                 "sayssusai",
1411                                 arg_machine,
1412                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1413                                 "nspawn",
1414                                 "container",
1415                                 (uint32_t) pid,
1416                                 strempty(arg_directory),
1417                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1418         } else {
1419                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1420
1421                 r = sd_bus_message_new_method_call(
1422                                 bus,
1423                                 &m,
1424                                 "org.freedesktop.machine1",
1425                                 "/org/freedesktop/machine1",
1426                                 "org.freedesktop.machine1.Manager",
1427                                 "CreateMachineWithNetwork");
1428                 if (r < 0) {
1429                         log_error("Failed to create message: %s", strerror(-r));
1430                         return r;
1431                 }
1432
1433                 r = sd_bus_message_append(
1434                                 m,
1435                                 "sayssusai",
1436                                 arg_machine,
1437                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1438                                 "nspawn",
1439                                 "container",
1440                                 (uint32_t) pid,
1441                                 strempty(arg_directory),
1442                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1443                 if (r < 0) {
1444                         log_error("Failed to append message arguments: %s", strerror(-r));
1445                         return r;
1446                 }
1447
1448                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1449                 if (r < 0) {
1450                         log_error("Failed to open container: %s", strerror(-r));
1451                         return r;
1452                 }
1453
1454                 if (!isempty(arg_slice)) {
1455                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1456                         if (r < 0) {
1457                                 log_error("Failed to append slice: %s", strerror(-r));
1458                                 return r;
1459                         }
1460                 }
1461
1462                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1463                 if (r < 0) {
1464                         log_error("Failed to add device policy: %s", strerror(-r));
1465                         return r;
1466                 }
1467
1468                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1469                                           /* Allow the container to
1470                                            * access and create the API
1471                                            * device nodes, so that
1472                                            * PrivateDevices= in the
1473                                            * container can work
1474                                            * fine */
1475                                           "/dev/null", "rwm",
1476                                           "/dev/zero", "rwm",
1477                                           "/dev/full", "rwm",
1478                                           "/dev/random", "rwm",
1479                                           "/dev/urandom", "rwm",
1480                                           "/dev/tty", "rwm",
1481                                           /* Allow the container
1482                                            * access to ptys. However,
1483                                            * do not permit the
1484                                            * container to ever create
1485                                            * these device nodes. */
1486                                           "/dev/pts/ptmx", "rw",
1487                                           "char-pts", "rw",
1488                                           /* Allow the container
1489                                            * access to all kdbus
1490                                            * devices. Again, the
1491                                            * container cannot create
1492                                            * these nodes, only use
1493                                            * them. We use a pretty
1494                                            * open match here, so that
1495                                            * the kernel API can still
1496                                            * change. */
1497                                           "char-kdbus", "rw",
1498                                           "char-kdbus/*", "rw");
1499                 if (r < 0) {
1500                         log_error("Failed to add device whitelist: %s", strerror(-r));
1501                         return r;
1502                 }
1503
1504                 r = sd_bus_message_close_container(m);
1505                 if (r < 0) {
1506                         log_error("Failed to close container: %s", strerror(-r));
1507                         return r;
1508                 }
1509
1510                 r = sd_bus_call(bus, m, 0, &error, NULL);
1511         }
1512
1513         if (r < 0) {
1514                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1515                 return r;
1516         }
1517
1518         return 0;
1519 }
1520
1521 static int terminate_machine(pid_t pid) {
1522         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1523         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1524         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1525         const char *path;
1526         int r;
1527
1528         if (!arg_register)
1529                 return 0;
1530
1531         r = sd_bus_default_system(&bus);
1532         if (r < 0) {
1533                 log_error("Failed to open system bus: %s", strerror(-r));
1534                 return r;
1535         }
1536
1537         r = sd_bus_call_method(
1538                         bus,
1539                         "org.freedesktop.machine1",
1540                         "/org/freedesktop/machine1",
1541                         "org.freedesktop.machine1.Manager",
1542                         "GetMachineByPID",
1543                         &error,
1544                         &reply,
1545                         "u",
1546                         (uint32_t) pid);
1547         if (r < 0) {
1548                 /* Note that the machine might already have been
1549                  * cleaned up automatically, hence don't consider it a
1550                  * failure if we cannot get the machine object. */
1551                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1552                 return 0;
1553         }
1554
1555         r = sd_bus_message_read(reply, "o", &path);
1556         if (r < 0)
1557                 return bus_log_parse_error(r);
1558
1559         r = sd_bus_call_method(
1560                         bus,
1561                         "org.freedesktop.machine1",
1562                         path,
1563                         "org.freedesktop.machine1.Machine",
1564                         "Terminate",
1565                         &error,
1566                         NULL,
1567                         NULL);
1568         if (r < 0) {
1569                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1570                 return 0;
1571         }
1572
1573         return 0;
1574 }
1575
1576 static int reset_audit_loginuid(void) {
1577         _cleanup_free_ char *p = NULL;
1578         int r;
1579
1580         if (arg_share_system)
1581                 return 0;
1582
1583         r = read_one_line_file("/proc/self/loginuid", &p);
1584         if (r == -ENOENT)
1585                 return 0;
1586         if (r < 0) {
1587                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1588                 return r;
1589         }
1590
1591         /* Already reset? */
1592         if (streq(p, "4294967295"))
1593                 return 0;
1594
1595         r = write_string_file("/proc/self/loginuid", "4294967295");
1596         if (r < 0) {
1597                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1598                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1599                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1600                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1601                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1602
1603                 sleep(5);
1604         }
1605
1606         return 0;
1607 }
1608
1609 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1610 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1611
1612 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
1613         int r;
1614
1615         uint8_t result[8];
1616         size_t l, sz;
1617         uint8_t *v;
1618
1619         l = strlen(arg_machine);
1620         sz = sizeof(sd_id128_t) + l;
1621         v = alloca(sz);
1622
1623         /* fetch some persistent data unique to the host */
1624         r = sd_id128_get_machine((sd_id128_t*) v);
1625         if (r < 0)
1626                 return r;
1627
1628         /* combine with some data unique (on this host) to this
1629          * container instance */
1630         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1631
1632         /* Let's hash the host machine ID plus the container name. We
1633          * use a fixed, but originally randomly created hash key here. */
1634         siphash24(result, v, sz, hash_key.bytes);
1635
1636         assert_cc(ETH_ALEN <= sizeof(result));
1637         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1638
1639         /* see eth_random_addr in the kernel */
1640         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1641         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1642
1643         return 0;
1644 }
1645
1646 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1647         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1648         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1649         struct ether_addr mac_host, mac_container;
1650         int r, i;
1651
1652         if (!arg_private_network)
1653                 return 0;
1654
1655         if (!arg_network_veth)
1656                 return 0;
1657
1658         /* Use two different interface name prefixes depending whether
1659          * we are in bridge mode or not. */
1660         snprintf(iface_name, IFNAMSIZ, "%s-%s",
1661                  arg_network_bridge ? "vb" : "ve", arg_machine);
1662
1663         r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
1664         if (r < 0) {
1665                 log_error("Failed to generate predictable MAC address for container side");
1666                 return r;
1667         }
1668
1669         r = generate_mac(&mac_host, HOST_HASH_KEY);
1670         if (r < 0) {
1671                 log_error("Failed to generate predictable MAC address for host side");
1672                 return r;
1673         }
1674
1675         r = sd_rtnl_open(&rtnl, 0);
1676         if (r < 0) {
1677                 log_error("Failed to connect to netlink: %s", strerror(-r));
1678                 return r;
1679         }
1680
1681         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1682         if (r < 0) {
1683                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1684                 return r;
1685         }
1686
1687         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1688         if (r < 0) {
1689                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1690                 return r;
1691         }
1692
1693         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1694         if (r < 0) {
1695                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1696                 return r;
1697         }
1698
1699         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1700         if (r < 0) {
1701                 log_error("Failed to open netlink container: %s", strerror(-r));
1702                 return r;
1703         }
1704
1705         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1706         if (r < 0) {
1707                 log_error("Failed to open netlink container: %s", strerror(-r));
1708                 return r;
1709         }
1710
1711         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1712         if (r < 0) {
1713                 log_error("Failed to open netlink container: %s", strerror(-r));
1714                 return r;
1715         }
1716
1717         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1718         if (r < 0) {
1719                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1720                 return r;
1721         }
1722
1723         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1724         if (r < 0) {
1725                 log_error("Failed to add netlink MAC address: %s", strerror(-r));
1726                 return r;
1727         }
1728
1729         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1730         if (r < 0) {
1731                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1732                 return r;
1733         }
1734
1735         r = sd_rtnl_message_close_container(m);
1736         if (r < 0) {
1737                 log_error("Failed to close netlink container: %s", strerror(-r));
1738                 return r;
1739         }
1740
1741         r = sd_rtnl_message_close_container(m);
1742         if (r < 0) {
1743                 log_error("Failed to close netlink container: %s", strerror(-r));
1744                 return r;
1745         }
1746
1747         r = sd_rtnl_message_close_container(m);
1748         if (r < 0) {
1749                 log_error("Failed to close netlink container: %s", strerror(-r));
1750                 return r;
1751         }
1752
1753         r = sd_rtnl_call(rtnl, m, 0, NULL);
1754         if (r < 0) {
1755                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1756                 return r;
1757         }
1758
1759         i = (int) if_nametoindex(iface_name);
1760         if (i <= 0) {
1761                 log_error("Failed to resolve interface %s: %m", iface_name);
1762                 return -errno;
1763         }
1764
1765         *ifi = i;
1766
1767         return 0;
1768 }
1769
1770 static int setup_bridge(const char veth_name[], int *ifi) {
1771         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1772         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1773         int r, bridge;
1774
1775         if (!arg_private_network)
1776                 return 0;
1777
1778         if (!arg_network_veth)
1779                 return 0;
1780
1781         if (!arg_network_bridge)
1782                 return 0;
1783
1784         bridge = (int) if_nametoindex(arg_network_bridge);
1785         if (bridge <= 0) {
1786                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1787                 return -errno;
1788         }
1789
1790         *ifi = bridge;
1791
1792         r = sd_rtnl_open(&rtnl, 0);
1793         if (r < 0) {
1794                 log_error("Failed to connect to netlink: %s", strerror(-r));
1795                 return r;
1796         }
1797
1798         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1799         if (r < 0) {
1800                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1801                 return r;
1802         }
1803
1804         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1805         if (r < 0) {
1806                 log_error("Failed to set IFF_UP flag: %s", strerror(-r));
1807                 return r;
1808         }
1809
1810         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1811         if (r < 0) {
1812                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1813                 return r;
1814         }
1815
1816         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1817         if (r < 0) {
1818                 log_error("Failed to add netlink master field: %s", strerror(-r));
1819                 return r;
1820         }
1821
1822         r = sd_rtnl_call(rtnl, m, 0, NULL);
1823         if (r < 0) {
1824                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1825                 return r;
1826         }
1827
1828         return 0;
1829 }
1830
1831 static int parse_interface(struct udev *udev, const char *name) {
1832         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1833         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1834         int ifi;
1835
1836         ifi = (int) if_nametoindex(name);
1837         if (ifi <= 0) {
1838                 log_error("Failed to resolve interface %s: %m", name);
1839                 return -errno;
1840         }
1841
1842         sprintf(ifi_str, "n%i", ifi);
1843         d = udev_device_new_from_device_id(udev, ifi_str);
1844         if (!d) {
1845                 log_error("Failed to get udev device for interface %s: %m", name);
1846                 return -errno;
1847         }
1848
1849         if (udev_device_get_is_initialized(d) <= 0) {
1850                 log_error("Network interface %s is not initialized yet.", name);
1851                 return -EBUSY;
1852         }
1853
1854         return ifi;
1855 }
1856
1857 static int move_network_interfaces(pid_t pid) {
1858         _cleanup_udev_unref_ struct udev *udev = NULL;
1859         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1860         char **i;
1861         int r;
1862
1863         if (!arg_private_network)
1864                 return 0;
1865
1866         if (strv_isempty(arg_network_interfaces))
1867                 return 0;
1868
1869         r = sd_rtnl_open(&rtnl, 0);
1870         if (r < 0) {
1871                 log_error("Failed to connect to netlink: %s", strerror(-r));
1872                 return r;
1873         }
1874
1875         udev = udev_new();
1876         if (!udev) {
1877                 log_error("Failed to connect to udev.");
1878                 return -ENOMEM;
1879         }
1880
1881         STRV_FOREACH(i, arg_network_interfaces) {
1882                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1883                 int ifi;
1884
1885                 ifi = parse_interface(udev, *i);
1886                 if (ifi < 0)
1887                         return ifi;
1888
1889                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1890                 if (r < 0) {
1891                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1892                         return r;
1893                 }
1894
1895                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1896                 if (r < 0) {
1897                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1898                         return r;
1899                 }
1900
1901                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1902                 if (r < 0) {
1903                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1904                         return r;
1905                 }
1906         }
1907
1908         return 0;
1909 }
1910
1911 static int setup_macvlan(pid_t pid) {
1912         _cleanup_udev_unref_ struct udev *udev = NULL;
1913         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1914         char **i;
1915         int r;
1916
1917         if (!arg_private_network)
1918                 return 0;
1919
1920         if (strv_isempty(arg_network_macvlan))
1921                 return 0;
1922
1923         r = sd_rtnl_open(&rtnl, 0);
1924         if (r < 0) {
1925                 log_error("Failed to connect to netlink: %s", strerror(-r));
1926                 return r;
1927         }
1928
1929         udev = udev_new();
1930         if (!udev) {
1931                 log_error("Failed to connect to udev.");
1932                 return -ENOMEM;
1933         }
1934
1935         STRV_FOREACH(i, arg_network_macvlan) {
1936                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1937                 _cleanup_free_ char *n = NULL;
1938                 int ifi;
1939
1940                 ifi = parse_interface(udev, *i);
1941                 if (ifi < 0)
1942                         return ifi;
1943
1944                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1945                 if (r < 0) {
1946                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1947                         return r;
1948                 }
1949
1950                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1951                 if (r < 0) {
1952                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1953                         return r;
1954                 }
1955
1956                 n = strappend("mv-", *i);
1957                 if (!n)
1958                         return log_oom();
1959
1960                 strshorten(n, IFNAMSIZ-1);
1961
1962                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1963                 if (r < 0) {
1964                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1965                         return r;
1966                 }
1967
1968                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1969                 if (r < 0) {
1970                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1971                         return r;
1972                 }
1973
1974                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1975                 if (r < 0) {
1976                         log_error("Failed to open netlink container: %s", strerror(-r));
1977                         return r;
1978                 }
1979
1980                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1981                 if (r < 0) {
1982                         log_error("Failed to open netlink container: %s", strerror(-r));
1983                         return r;
1984                 }
1985
1986                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1987                 if (r < 0) {
1988                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1989                         return r;
1990                 }
1991
1992                 r = sd_rtnl_message_close_container(m);
1993                 if (r < 0) {
1994                         log_error("Failed to close netlink container: %s", strerror(-r));
1995                         return r;
1996                 }
1997
1998                 r = sd_rtnl_message_close_container(m);
1999                 if (r < 0) {
2000                         log_error("Failed to close netlink container: %s", strerror(-r));
2001                         return r;
2002                 }
2003
2004                 r = sd_rtnl_call(rtnl, m, 0, NULL);
2005                 if (r < 0) {
2006                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
2007                         return r;
2008                 }
2009         }
2010
2011         return 0;
2012 }
2013
/* Installs the seccomp filter for the container payload.
 *
 * The filter allows everything by default, makes a fixed list of
 * system calls fail with EPERM, and makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT. The
 * filter is loaded with the NO_NEW_PRIVS attribute turned off.
 *
 * Returns 0 on success (always 0 when built without HAVE_SECCOMP), a
 * negative errno-style value otherwise. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* System calls the container is not allowed to use. */
        static const int denied_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned k;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto finish;
        }

        for (k = 0; k < ELEMENTSOF(denied_syscalls); k++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), denied_syscalls[k], 0);
                if (r == -EFAULT)
                        /* Unknown syscall, nothing to block. */
                        continue;
                if (r < 0) {
                        log_error("Failed to block syscall: %s", strerror(-r));
                        goto finish;
                }
        }

        /* Audit is broken in containers, and much of the userspace
         * audit hookup fails when run inside one. We don't care and
         * simply turn off creation of audit sockets.
         *
         * socket(AF_NETLINK, *, NETLINK_AUDIT) will hence fail with
         * EAFNOSUPPORT, which audit userspace takes as indication
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

finish:
        /* The filter context is released on success and failure alike. */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2093
2094 static int setup_image(char **device_path, int *loop_nr) {
2095         struct loop_info64 info = {
2096                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2097         };
2098         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2099         _cleanup_free_ char* loopdev = NULL;
2100         struct stat st;
2101         int r, nr;
2102
2103         assert(device_path);
2104         assert(loop_nr);
2105
2106         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2107         if (fd < 0) {
2108                 log_error("Failed to open %s: %m", arg_image);
2109                 return -errno;
2110         }
2111
2112         if (fstat(fd, &st) < 0) {
2113                 log_error("Failed to stat %s: %m", arg_image);
2114                 return -errno;
2115         }
2116
2117         if (S_ISBLK(st.st_mode)) {
2118                 char *p;
2119
2120                 p = strdup(arg_image);
2121                 if (!p)
2122                         return log_oom();
2123
2124                 *device_path = p;
2125
2126                 *loop_nr = -1;
2127
2128                 r = fd;
2129                 fd = -1;
2130
2131                 return r;
2132         }
2133
2134         if (!S_ISREG(st.st_mode)) {
2135                 log_error("%s is not a regular file or block device: %m", arg_image);
2136                 return -EINVAL;
2137         }
2138
2139         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2140         if (control < 0) {
2141                 log_error("Failed to open /dev/loop-control: %m");
2142                 return -errno;
2143         }
2144
2145         nr = ioctl(control, LOOP_CTL_GET_FREE);
2146         if (nr < 0) {
2147                 log_error("Failed to allocate loop device: %m");
2148                 return -errno;
2149         }
2150
2151         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2152                 return log_oom();
2153
2154         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2155         if (loop < 0) {
2156                 log_error("Failed to open loop device %s: %m", loopdev);
2157                 return -errno;
2158         }
2159
2160         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2161                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2162                 return -errno;
2163         }
2164
2165         if (arg_read_only)
2166                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2167
2168         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2169                 log_error("Failed to set loopback settings on %s: %m", loopdev);
2170                 return -errno;
2171         }
2172
2173         *device_path = loopdev;
2174         loopdev = NULL;
2175
2176         *loop_nr = nr;
2177
2178         r = loop;
2179         loop = -1;
2180
2181         return r;
2182 }
2183
/* Analyzes the GPT partition table of the block device open as `fd`
 * and picks the partitions to mount.
 *
 * Partitions are matched by GPT type ID against GPT_HOME, GPT_SRV and,
 * where defined at build time, GPT_ROOT_NATIVE and GPT_ROOT_SECONDARY.
 * Partitions carrying GPT_FLAG_NO_AUTO are ignored. If several
 * partitions of the same type exist, the one with the lowest
 * partition number wins.
 *
 * On success 0 is returned. *root_device receives the device node of
 * the native root partition, or of the secondary root partition if no
 * native one exists; *secondary tells which of the two was chosen.
 * *home_device and *srv_device are only written if such partitions
 * were found. The matching *_rw flags are false for partitions that
 * carry GPT_FLAG_READ_ONLY. All returned strings are newly allocated
 * and owned by the caller.
 *
 * Returns -EINVAL if there is no GPT or no root partition, another
 * negative errno-style value on other failures, and -ENOTSUP when
 * built without HAVE_BLKID. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerate = NULL;
        _cleanup_udev_device_unref_ struct udev_device *whole = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist partitions;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        /* Step 1: probe the partition table with libblkid. */

        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        errno = 0;
        r = blkid_probe_set_device(probe, fd, 0, 0);
        if (r != 0) {
                /* A failure that leaves errno unset is treated as OOM. */
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(probe, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partitions = blkid_probe_get_partitions(probe);
        if (!partitions) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        /* Step 2: enumerate the child devices of the block device via
         * udev, to learn the device nodes of the partitions. */

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        whole = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!whole)
                return log_oom();

        enumerate = udev_enumerate_new(udev);
        if (!enumerate)
                return log_oom();

        r = udev_enumerate_add_match_parent(enumerate, whole);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(enumerate);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        /* Step 3: match each child device against the partition table
         * and remember the best candidate per partition type. */

        first = udev_enumerate_get_list_entry(enumerate);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *part = NULL;
                const char *type_string, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition entry;
                dev_t devnum;
                int nr;

                errno = 0;
                part = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!part) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number ... */
                devnum = udev_device_get_devnum(part);
                if (major(devnum) == 0)
                        continue;

                /* ... the whole-disk device itself ... */
                if (st.st_rdev == devnum)
                        continue;

                /* ... and devices that have no device node. */
                node = udev_device_get_devnode(part);
                if (!node)
                        continue;

                entry = blkid_partlist_devno_to_partition(partitions, devnum);
                if (!entry)
                        continue;

                flags = blkid_partition_get_flags(entry);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(entry);
                if (nr < 0)
                        continue;

                type_string = blkid_partition_get_type_string(entry);
                if (!type_string)
                        continue;

                if (sd_id128_from_string(type_string, &type_id) < 0)
                        continue;

                if (sd_id128_equal(type_id, GPT_HOME)) {

                        /* Only replace an earlier pick with a lower
                         * partition number. */
                        if (home && nr >= home_nr)
                                continue;

                        home_nr = nr;
                        home_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(home);
                        home = strdup(node);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_id, GPT_SRV)) {

                        if (srv && nr >= srv_nr)
                                continue;

                        srv_nr = nr;
                        srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(srv);
                        srv = strdup(node);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                        if (root && nr >= root_nr)
                                continue;

                        root_nr = nr;
                        root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(root);
                        root = strdup(node);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && nr >= secondary_root_nr)
                                continue;

                        secondary_root_nr = nr;
                        secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(secondary_root);
                        secondary_root = strdup(node);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        /* Step 4: hand the results over to the caller. */

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        if (root) {
                /* The native root partition takes precedence. */
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else {
                /* No native root, hence a secondary one must exist,
                 * as we checked above. */
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2433
/* Mounts the block device `what` on `where`, or on `where` with
 * `directory` appended if `directory` is non-NULL.
 *
 * The file system type is detected with libblkid. The mount is done
 * with MS_NODEV, and read-only unless `rw` is true and arg_read_only
 * is unset. LUKS volumes are refused.
 *
 * Returns 0 on success. Returns -EINVAL if no file system type could
 * be determined (nothing detected, or ambivalent result), -ENOTSUP
 * for LUKS volumes or when built without HAVE_BLKID, and another
 * negative errno-style value if probing or mounting fails. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        /* A global read-only request overrides the per-partition flag. */
        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                /* A failure that leaves errno unset is treated as OOM. */
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* blkid_do_safeprobe() returns 1 if nothing was
                 * detected and -2 if the result is ambivalent. Either
                 * way we cannot tell the file system type. A return
                 * value of -1 is a real error and is handled below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2499
/*
 * Mount the dissected image partitions below 'where'.
 *
 * The root partition is mounted directly on 'where', the home partition on
 * "<where>/home" and the server data partition on "<where>/srv". Any of the
 * device arguments may be NULL, in which case that partition is skipped.
 * The *_rw flags are passed through to mount_device().
 *
 * Returns 0 on success, or the negative error returned by the first
 * mount_device() call that failed.
 */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Root goes first: the other mount points live inside of it */
        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2535
2536 static void loop_remove(int nr, int *image_fd) {
2537         _cleanup_close_ int control = -1;
2538
2539         if (nr < 0)
2540                 return;
2541
2542         if (image_fd && *image_fd >= 0) {
2543                 ioctl(*image_fd, LOOP_CLR_FD);
2544                 *image_fd = safe_close(*image_fd);
2545         }
2546
2547         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2548         if (control < 0)
2549                 return;
2550
2551         ioctl(control, LOOP_CTL_REMOVE, nr);
2552 }
2553
2554 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2555         int pipe_fds[2];
2556         pid_t pid;
2557
2558         assert(database);
2559         assert(key);
2560         assert(rpid);
2561
2562         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2563                 log_error("Failed to allocate pipe: %m");
2564                 return -errno;
2565         }
2566
2567         pid = fork();
2568         if (pid < 0) {
2569                 log_error("Failed to fork getent child: %m");
2570                 return -errno;
2571         } else if (pid == 0) {
2572                 int nullfd;
2573                 char *empty_env = NULL;
2574
2575                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2576                         _exit(EXIT_FAILURE);
2577
2578                 if (pipe_fds[0] > 2)
2579                         safe_close(pipe_fds[0]);
2580                 if (pipe_fds[1] > 2)
2581                         safe_close(pipe_fds[1]);
2582
2583                 nullfd = open("/dev/null", O_RDWR);
2584                 if (nullfd < 0)
2585                         _exit(EXIT_FAILURE);
2586
2587                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2588                         _exit(EXIT_FAILURE);
2589
2590                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2591                         _exit(EXIT_FAILURE);
2592
2593                 if (nullfd > 2)
2594                         safe_close(nullfd);
2595
2596                 reset_all_signal_handlers();
2597                 close_all_fds(NULL, 0);
2598
2599                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2600                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2601                 _exit(EXIT_FAILURE);
2602         }
2603
2604         pipe_fds[1] = safe_close(pipe_fds[1]);
2605
2606         *rpid = pid;
2607
2608         return pipe_fds[0];
2609 }
2610
2611 static int change_uid_gid(char **_home) {
2612         char line[LINE_MAX], *x, *u, *g, *h;
2613         const char *word, *state;
2614         _cleanup_free_ uid_t *uids = NULL;
2615         _cleanup_free_ char *home = NULL;
2616         _cleanup_fclose_ FILE *f = NULL;
2617         _cleanup_close_ int fd = -1;
2618         unsigned n_uids = 0;
2619         size_t sz = 0, l;
2620         uid_t uid;
2621         gid_t gid;
2622         pid_t pid;
2623         int r;
2624
2625         assert(_home);
2626
2627         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2628                 /* Reset everything fully to 0, just in case */
2629
2630                 if (setgroups(0, NULL) < 0) {
2631                         log_error("setgroups() failed: %m");
2632                         return -errno;
2633                 }
2634
2635                 if (setresgid(0, 0, 0) < 0) {
2636                         log_error("setregid() failed: %m");
2637                         return -errno;
2638                 }
2639
2640                 if (setresuid(0, 0, 0) < 0) {
2641                         log_error("setreuid() failed: %m");
2642                         return -errno;
2643                 }
2644
2645                 *_home = NULL;
2646                 return 0;
2647         }
2648
2649         /* First, get user credentials */
2650         fd = spawn_getent("passwd", arg_user, &pid);
2651         if (fd < 0)
2652                 return fd;
2653
2654         f = fdopen(fd, "r");
2655         if (!f)
2656                 return log_oom();
2657         fd = -1;
2658
2659         if (!fgets(line, sizeof(line), f)) {
2660
2661                 if (!ferror(f)) {
2662                         log_error("Failed to resolve user %s.", arg_user);
2663                         return -ESRCH;
2664                 }
2665
2666                 log_error("Failed to read from getent: %m");
2667                 return -errno;
2668         }
2669
2670         truncate_nl(line);
2671
2672         wait_for_terminate_and_warn("getent passwd", pid);
2673
2674         x = strchr(line, ':');
2675         if (!x) {
2676                 log_error("/etc/passwd entry has invalid user field.");
2677                 return -EIO;
2678         }
2679
2680         u = strchr(x+1, ':');
2681         if (!u) {
2682                 log_error("/etc/passwd entry has invalid password field.");
2683                 return -EIO;
2684         }
2685
2686         u++;
2687         g = strchr(u, ':');
2688         if (!g) {
2689                 log_error("/etc/passwd entry has invalid UID field.");
2690                 return -EIO;
2691         }
2692
2693         *g = 0;
2694         g++;
2695         x = strchr(g, ':');
2696         if (!x) {
2697                 log_error("/etc/passwd entry has invalid GID field.");
2698                 return -EIO;
2699         }
2700
2701         *x = 0;
2702         h = strchr(x+1, ':');
2703         if (!h) {
2704                 log_error("/etc/passwd entry has invalid GECOS field.");
2705                 return -EIO;
2706         }
2707
2708         h++;
2709         x = strchr(h, ':');
2710         if (!x) {
2711                 log_error("/etc/passwd entry has invalid home directory field.");
2712                 return -EIO;
2713         }
2714
2715         *x = 0;
2716
2717         r = parse_uid(u, &uid);
2718         if (r < 0) {
2719                 log_error("Failed to parse UID of user.");
2720                 return -EIO;
2721         }
2722
2723         r = parse_gid(g, &gid);
2724         if (r < 0) {
2725                 log_error("Failed to parse GID of user.");
2726                 return -EIO;
2727         }
2728
2729         home = strdup(h);
2730         if (!home)
2731                 return log_oom();
2732
2733         /* Second, get group memberships */
2734         fd = spawn_getent("initgroups", arg_user, &pid);
2735         if (fd < 0)
2736                 return fd;
2737
2738         fclose(f);
2739         f = fdopen(fd, "r");
2740         if (!f)
2741                 return log_oom();
2742         fd = -1;
2743
2744         if (!fgets(line, sizeof(line), f)) {
2745                 if (!ferror(f)) {
2746                         log_error("Failed to resolve user %s.", arg_user);
2747                         return -ESRCH;
2748                 }
2749
2750                 log_error("Failed to read from getent: %m");
2751                 return -errno;
2752         }
2753
2754         truncate_nl(line);
2755
2756         wait_for_terminate_and_warn("getent initgroups", pid);
2757
2758         /* Skip over the username and subsequent separator whitespace */
2759         x = line;
2760         x += strcspn(x, WHITESPACE);
2761         x += strspn(x, WHITESPACE);
2762
2763         FOREACH_WORD(word, l, x, state) {
2764                 char c[l+1];
2765
2766                 memcpy(c, word, l);
2767                 c[l] = 0;
2768
2769                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2770                         return log_oom();
2771
2772                 r = parse_uid(c, &uids[n_uids++]);
2773                 if (r < 0) {
2774                         log_error("Failed to parse group data from getent.");
2775                         return -EIO;
2776                 }
2777         }
2778
2779         r = mkdir_parents(home, 0775);
2780         if (r < 0) {
2781                 log_error("Failed to make home root directory: %s", strerror(-r));
2782                 return r;
2783         }
2784
2785         r = mkdir_safe(home, 0755, uid, gid);
2786         if (r < 0 && r != -EEXIST) {
2787                 log_error("Failed to make home directory: %s", strerror(-r));
2788                 return r;
2789         }
2790
2791         fchown(STDIN_FILENO, uid, gid);
2792         fchown(STDOUT_FILENO, uid, gid);
2793         fchown(STDERR_FILENO, uid, gid);
2794
2795         if (setgroups(n_uids, uids) < 0) {
2796                 log_error("Failed to set auxiliary groups: %m");
2797                 return -errno;
2798         }
2799
2800         if (setresgid(gid, gid, gid) < 0) {
2801                 log_error("setregid() failed: %m");
2802                 return -errno;
2803         }
2804
2805         if (setresuid(uid, uid, uid) < 0) {
2806                 log_error("setreuid() failed: %m");
2807                 return -errno;
2808         }
2809
2810         if (_home) {
2811                 *_home = home;
2812                 home = NULL;
2813         }
2814
2815         return 0;
2816 }
2817
2818 /*
2819  * Return values:
2820  * < 0 : wait_for_terminate() failed to get the state of the
2821  *       container, the container was terminated by a signal, or
2822  *       failed for an unknown reason.  No change is made to the
2823  *       container argument.
2824  * > 0 : The program executed in the container terminated with an
2825  *       error.  The exit code of the program executed in the
2826  *       container is returned.  No change is made to the container
2827  *       argument.
2828  *   0 : The container is being rebooted, has been shut down or exited
2829  *       successfully.  The container argument has been set to either
2830  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2831  *
2832  * That is, success is indicated by a return value of zero, and an
2833  * error is indicated by a non-zero value.
2834  */
2835 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2836         int r;
2837         siginfo_t status;
2838
2839         r = wait_for_terminate(pid, &status);
2840         if (r < 0) {
2841                 log_warning("Failed to wait for container: %s", strerror(-r));
2842                 return r;
2843         }
2844
2845         switch (status.si_code) {
2846         case CLD_EXITED:
2847                 r = status.si_status;
2848                 if (r == 0) {
2849                         if (!arg_quiet)
2850                                 log_debug("Container %s exited successfully.",
2851                                           arg_machine);
2852
2853                         *container = CONTAINER_TERMINATED;
2854                 } else {
2855                         log_error("Container %s failed with error code %i.",
2856                                   arg_machine, status.si_status);
2857                 }
2858                 break;
2859
2860         case CLD_KILLED:
2861                 if (status.si_status == SIGINT) {
2862                         if (!arg_quiet)
2863                                 log_info("Container %s has been shut down.",
2864                                          arg_machine);
2865
2866                         *container = CONTAINER_TERMINATED;
2867                         r = 0;
2868                         break;
2869                 } else if (status.si_status == SIGHUP) {
2870                         if (!arg_quiet)
2871                                 log_info("Container %s is being rebooted.",
2872                                          arg_machine);
2873
2874                         *container = CONTAINER_REBOOTED;
2875                         r = 0;
2876                         break;
2877                 }
2878                 /* CLD_KILLED fallthrough */
2879
2880         case CLD_DUMPED:
2881                 log_error("Container %s terminated by signal %s.",
2882                           arg_machine, signal_to_string(status.si_status));
2883                 r = -1;
2884                 break;
2885
2886         default:
2887                 log_error("Container %s failed due to unknown reason.",
2888                           arg_machine);
2889                 r = -1;
2890                 break;
2891         }
2892
2893         return r;
2894 }
2895
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts blocking calls in the parent. */
static void nop_handler(int sig) {
}
2897
2898 int main(int argc, char *argv[]) {
2899
2900         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2901         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2902         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2903         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2904         _cleanup_fdset_free_ FDSet *fds = NULL;
2905         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2906         const char *console = NULL;
2907         char veth_name[IFNAMSIZ];
2908         bool secondary = false;
2909         sigset_t mask, mask_chld;
2910         pid_t pid = 0;
2911
2912         log_parse_environment();
2913         log_open();
2914
2915         k = parse_argv(argc, argv);
2916         if (k < 0)
2917                 goto finish;
2918         else if (k == 0) {
2919                 r = EXIT_SUCCESS;
2920                 goto finish;
2921         }
2922
2923         if (!arg_image) {
2924                 if (arg_directory) {
2925                         char *p;
2926
2927                         p = path_make_absolute_cwd(arg_directory);
2928                         free(arg_directory);
2929                         arg_directory = p;
2930                 } else
2931                         arg_directory = get_current_dir_name();
2932
2933                 if (!arg_directory) {
2934                         log_error("Failed to determine path, please use -D.");
2935                         goto finish;
2936                 }
2937                 path_kill_slashes(arg_directory);
2938         }
2939
2940         if (!arg_machine) {
2941                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2942                 if (!arg_machine) {
2943                         log_oom();
2944                         goto finish;
2945                 }
2946
2947                 hostname_cleanup(arg_machine, false);
2948                 if (isempty(arg_machine)) {
2949                         log_error("Failed to determine machine name automatically, please use -M.");
2950                         goto finish;
2951                 }
2952         }
2953
2954         if (geteuid() != 0) {
2955                 log_error("Need to be root.");
2956                 goto finish;
2957         }
2958
2959         if (sd_booted() <= 0) {
2960                 log_error("Not running on a systemd system.");
2961                 goto finish;
2962         }
2963
2964         log_close();
2965         n_fd_passed = sd_listen_fds(false);
2966         if (n_fd_passed > 0) {
2967                 k = fdset_new_listen_fds(&fds, false);
2968                 if (k < 0) {
2969                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2970                         goto finish;
2971                 }
2972         }
2973         fdset_close_others(fds);
2974         log_open();
2975
2976         if (arg_directory) {
2977                 if (path_equal(arg_directory, "/")) {
2978                         log_error("Spawning container on root directory not supported.");
2979                         goto finish;
2980                 }
2981
2982                 if (arg_boot) {
2983                         if (path_is_os_tree(arg_directory) <= 0) {
2984                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2985                                 goto finish;
2986                         }
2987                 } else {
2988                         const char *p;
2989
2990                         p = strappenda(arg_directory,
2991                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2992                         if (access(p, F_OK) < 0) {
2993                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2994                                 goto finish;
2995
2996                         }
2997                 }
2998         } else {
2999                 char template[] = "/tmp/nspawn-root-XXXXXX";
3000
3001                 if (!mkdtemp(template)) {
3002                         log_error("Failed to create temporary directory: %m");
3003                         r = -errno;
3004                         goto finish;
3005                 }
3006
3007                 arg_directory = strdup(template);
3008                 if (!arg_directory) {
3009                         r = log_oom();
3010                         goto finish;
3011                 }
3012
3013                 image_fd = setup_image(&device_path, &loop_nr);
3014                 if (image_fd < 0) {
3015                         r = image_fd;
3016                         goto finish;
3017                 }
3018
3019                 r = dissect_image(image_fd,
3020                                   &root_device, &root_device_rw,
3021                                   &home_device, &home_device_rw,
3022                                   &srv_device, &srv_device_rw,
3023                                   &secondary);
3024                 if (r < 0)
3025                         goto finish;
3026         }
3027
3028         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
3029         if (master < 0) {
3030                 log_error("Failed to acquire pseudo tty: %m");
3031                 goto finish;
3032         }
3033
3034         console = ptsname(master);
3035         if (!console) {
3036                 log_error("Failed to determine tty name: %m");
3037                 goto finish;
3038         }
3039
3040         if (!arg_quiet)
3041                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
3042                          arg_machine, arg_image ? arg_image : arg_directory);
3043
3044         if (unlockpt(master) < 0) {
3045                 log_error("Failed to unlock tty: %m");
3046                 goto finish;
3047         }
3048
3049         if (access("/dev/kdbus/control", F_OK) >= 0) {
3050
3051                 if (arg_share_system) {
3052                         kdbus_domain = strdup("/dev/kdbus");
3053                         if (!kdbus_domain) {
3054                                 log_oom();
3055                                 goto finish;
3056                         }
3057                 } else {
3058                         const char *ns;
3059
3060                         ns = strappenda("machine-", arg_machine);
3061                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
3062                         if (r < 0)
3063                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
3064                         else
3065                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
3066                 }
3067         }
3068
3069         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3070                 log_error("Failed to create kmsg socket pair: %m");
3071                 goto finish;
3072         }
3073
3074         sd_notify(false,
3075                   "READY=1\n"
3076                   "STATUS=Container running.");
3077
3078         assert_se(sigemptyset(&mask) == 0);
3079         assert_se(sigemptyset(&mask_chld) == 0);
3080         sigaddset(&mask_chld, SIGCHLD);
3081         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3082         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3083
3084         for (;;) {
3085                 ContainerStatus container_status;
3086                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3087                 struct sigaction sa = {
3088                         .sa_handler = nop_handler,
3089                         .sa_flags = SA_NOCLDSTOP,
3090                 };
3091
3092                 r = barrier_create(&barrier);
3093                 if (r < 0) {
3094                         log_error("Cannot initialize IPC barrier: %s", strerror(-r));
3095                         goto finish;
3096                 }
3097
3098                 /* Child can be killed before execv(), so handle SIGCHLD
3099                  * in order to interrupt parent's blocking calls and
3100                  * give it a chance to call wait() and terminate. */
3101                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3102                 if (r < 0) {
3103                         log_error("Failed to change the signal mask: %m");
3104                         goto finish;
3105                 }
3106
3107                 r = sigaction(SIGCHLD, &sa, NULL);
3108                 if (r < 0) {
3109                         log_error("Failed to install SIGCHLD handler: %m");
3110                         goto finish;
3111                 }
3112
3113                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3114                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3115                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
3116                 if (pid < 0) {
3117                         if (errno == EINVAL)
3118                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3119                         else
3120                                 log_error("clone() failed: %m");
3121
3122                         r = pid;
3123                         goto finish;
3124                 }
3125
3126                 if (pid == 0) {
3127                         /* child */
3128                         _cleanup_free_ char *home = NULL;
3129                         unsigned n_env = 2;
3130                         const char *envp[] = {
3131                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3132                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3133                                 NULL, /* TERM */
3134                                 NULL, /* HOME */
3135                                 NULL, /* USER */
3136                                 NULL, /* LOGNAME */
3137                                 NULL, /* container_uuid */
3138                                 NULL, /* LISTEN_FDS */
3139                                 NULL, /* LISTEN_PID */
3140                                 NULL
3141                         };
3142                         char **env_use;
3143
3144                         barrier_set_role(&barrier, BARRIER_CHILD);
3145
3146                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3147                         if (envp[n_env])
3148                                 n_env ++;
3149
3150                         master = safe_close(master);
3151
3152                         close_nointr(STDIN_FILENO);
3153                         close_nointr(STDOUT_FILENO);
3154                         close_nointr(STDERR_FILENO);
3155
3156                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3157
3158                         reset_all_signal_handlers();
3159                         reset_signal_mask();
3160
3161                         k = open_terminal(console, O_RDWR);
3162                         if (k != STDIN_FILENO) {
3163                                 if (k >= 0) {
3164                                         safe_close(k);
3165                                         k = -EINVAL;
3166                                 }
3167
3168                                 log_error("Failed to open console: %s", strerror(-k));
3169                                 _exit(EXIT_FAILURE);
3170                         }
3171
3172                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3173                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3174                                 log_error("Failed to duplicate console: %m");
3175                                 _exit(EXIT_FAILURE);
3176                         }
3177
3178                         if (setsid() < 0) {
3179                                 log_error("setsid() failed: %m");
3180                                 _exit(EXIT_FAILURE);
3181                         }
3182
3183                         if (reset_audit_loginuid() < 0)
3184                                 _exit(EXIT_FAILURE);
3185
3186                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3187                                 log_error("PR_SET_PDEATHSIG failed: %m");
3188                                 _exit(EXIT_FAILURE);
3189                         }
3190
3191                         /* Mark everything as slave, so that we still
3192                          * receive mounts from the real root, but don't
3193                          * propagate mounts to the real root. */
3194                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3195                                 log_error("MS_SLAVE|MS_REC failed: %m");
3196                                 _exit(EXIT_FAILURE);
3197                         }
3198
3199                         if (mount_devices(arg_directory,
3200                                           root_device, root_device_rw,
3201                                           home_device, home_device_rw,
3202                                           srv_device, srv_device_rw) < 0)
3203                                 _exit(EXIT_FAILURE);
3204
3205                         /* Turn directory into bind mount */
3206                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3207                                 log_error("Failed to make bind mount: %m");
3208                                 _exit(EXIT_FAILURE);
3209                         }
3210
3211                         r = setup_volatile(arg_directory);
3212                         if (r < 0)
3213                                 _exit(EXIT_FAILURE);
3214
3215                         if (setup_volatile_state(arg_directory) < 0)
3216                                 _exit(EXIT_FAILURE);
3217
3218                         r = base_filesystem_create(arg_directory);
3219                         if (r < 0)
3220                                 _exit(EXIT_FAILURE);
3221
3222                         if (arg_read_only) {
3223                                 k = bind_remount_recursive(arg_directory, true);
3224                                 if (k < 0) {
3225                                         log_error("Failed to make tree read-only: %s", strerror(-k));
3226                                         _exit(EXIT_FAILURE);
3227                                 }
3228                         }
3229
3230                         if (mount_all(arg_directory) < 0)
3231                                 _exit(EXIT_FAILURE);
3232
3233                         if (copy_devnodes(arg_directory) < 0)
3234                                 _exit(EXIT_FAILURE);
3235
3236                         if (setup_ptmx(arg_directory) < 0)
3237                                 _exit(EXIT_FAILURE);
3238
3239                         dev_setup(arg_directory);
3240
3241                         if (setup_seccomp() < 0)
3242                                 _exit(EXIT_FAILURE);
3243
3244                         if (setup_dev_console(arg_directory, console) < 0)
3245                                 _exit(EXIT_FAILURE);
3246
3247                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
3248                                 _exit(EXIT_FAILURE);
3249
3250                         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
3251
3252                         if (setup_boot_id(arg_directory) < 0)
3253                                 _exit(EXIT_FAILURE);
3254
3255                         if (setup_timezone(arg_directory) < 0)
3256                                 _exit(EXIT_FAILURE);
3257
3258                         if (setup_resolv_conf(arg_directory) < 0)
3259                                 _exit(EXIT_FAILURE);
3260
3261                         if (setup_journal(arg_directory) < 0)
3262                                 _exit(EXIT_FAILURE);
3263
3264                         if (mount_binds(arg_directory, arg_bind, false) < 0)
3265                                 _exit(EXIT_FAILURE);
3266
3267                         if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
3268                                 _exit(EXIT_FAILURE);
3269
3270                         if (mount_tmpfs(arg_directory) < 0)
3271                                 _exit(EXIT_FAILURE);
3272
3273                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
3274                                 _exit(EXIT_FAILURE);
3275
3276                         /* Tell the parent that we are ready, and that
3277                          * it can cgroupify us to that we lack access
3278                          * to certain devices and resources. */
3279                         barrier_place(&barrier);
3280
3281                         if (chdir(arg_directory) < 0) {
3282                                 log_error("chdir(%s) failed: %m", arg_directory);
3283                                 _exit(EXIT_FAILURE);
3284                         }
3285
3286                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
3287                                 log_error("mount(MS_MOVE) failed: %m");
3288                                 _exit(EXIT_FAILURE);
3289                         }
3290
3291                         if (chroot(".") < 0) {
3292                                 log_error("chroot() failed: %m");
3293                                 _exit(EXIT_FAILURE);
3294                         }
3295
3296                         if (chdir("/") < 0) {
3297                                 log_error("chdir() failed: %m");
3298                                 _exit(EXIT_FAILURE);
3299                         }
3300
3301                         umask(0022);
3302
3303                         if (arg_private_network)
3304                                 loopback_setup();
3305
3306                         if (drop_capabilities() < 0) {
3307                                 log_error("drop_capabilities() failed: %m");
3308                                 _exit(EXIT_FAILURE);
3309                         }
3310
3311                         r = change_uid_gid(&home);
3312                         if (r < 0)
3313                                 _exit(EXIT_FAILURE);
3314
3315                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
3316                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
3317                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
3318                                 log_oom();
3319                                 _exit(EXIT_FAILURE);
3320                         }
3321
3322                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
3323                                 char as_uuid[37];
3324
3325                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", id128_format_as_uuid(arg_uuid, as_uuid)) < 0) {
3326                                         log_oom();
3327                                         _exit(EXIT_FAILURE);
3328                                 }
3329                         }
3330
3331                         if (fdset_size(fds) > 0) {
3332                                 k = fdset_cloexec(fds, false);
3333                                 if (k < 0) {
3334                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
3335                                         _exit(EXIT_FAILURE);
3336                                 }
3337
3338                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
3339                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
3340                                         log_oom();
3341                                         _exit(EXIT_FAILURE);
3342                                 }
3343                         }
3344
3345                         setup_hostname();
3346
3347                         if (arg_personality != 0xffffffffLU) {
3348                                 if (personality(arg_personality) < 0) {
3349                                         log_error("personality() failed: %m");
3350                                         _exit(EXIT_FAILURE);
3351                                 }
3352                         } else if (secondary) {
3353                                 if (personality(PER_LINUX32) < 0) {
3354                                         log_error("personality() failed: %m");
3355                                         _exit(EXIT_FAILURE);
3356                                 }
3357                         }
3358
3359 #ifdef HAVE_SELINUX
3360                         if (arg_selinux_context)
3361                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
3362                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
3363                                         _exit(EXIT_FAILURE);
3364                                 }
3365 #endif
3366
3367                         if (!strv_isempty(arg_setenv)) {
3368                                 char **n;
3369
3370                                 n = strv_env_merge(2, envp, arg_setenv);
3371                                 if (!n) {
3372                                         log_oom();
3373                                         _exit(EXIT_FAILURE);
3374                                 }
3375
3376                                 env_use = n;
3377                         } else
3378                                 env_use = (char**) envp;
3379
3380                         /* Wait until the parent is ready with the setup, too... */
3381                         if (!barrier_place_and_sync(&barrier))
3382                                 _exit(EXIT_FAILURE);
3383
3384                         if (arg_boot) {
3385                                 char **a;
3386                                 size_t l;
3387
3388                                 /* Automatically search for the init system */
3389
3390                                 l = 1 + argc - optind;
3391                                 a = newa(char*, l + 1);
3392                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
3393
3394                                 a[0] = (char*) "/usr/lib/systemd/systemd";
3395                                 execve(a[0], a, env_use);
3396
3397                                 a[0] = (char*) "/lib/systemd/systemd";
3398                                 execve(a[0], a, env_use);
3399
3400                                 a[0] = (char*) "/sbin/init";
3401                                 execve(a[0], a, env_use);
3402                         } else if (argc > optind)
3403                                 execvpe(argv[optind], argv + optind, env_use);
3404                         else {
3405                                 chdir(home ? home : "/root");
3406                                 execle("/bin/bash", "-bash", NULL, env_use);
3407                                 execle("/bin/sh", "-sh", NULL, env_use);
3408                         }
3409
3410                         log_error("execv() failed: %m");
3411                         _exit(EXIT_FAILURE);
3412                 }
3413
3414                 barrier_set_role(&barrier, BARRIER_PARENT);
3415                 fdset_free(fds);
3416                 fds = NULL;
3417
3418                 /* wait for child-setup to be done */
3419                 if (barrier_place_and_sync(&barrier)) {
3420                         int ifi = 0;
3421
3422                         r = move_network_interfaces(pid);
3423                         if (r < 0)
3424                                 goto finish;
3425
3426                         r = setup_veth(pid, veth_name, &ifi);
3427                         if (r < 0)
3428                                 goto finish;
3429
3430                         r = setup_bridge(veth_name, &ifi);
3431                         if (r < 0)
3432                                 goto finish;
3433
3434                         r = setup_macvlan(pid);
3435                         if (r < 0)
3436                                 goto finish;
3437
3438                         r = register_machine(pid, ifi);
3439                         if (r < 0)
3440                                 goto finish;
3441
3442                         /* Block SIGCHLD here, before notifying child.
3443                          * process_pty() will handle it with the other signals. */
3444                         r = sigprocmask(SIG_BLOCK, &mask_chld, NULL);
3445                         if (r < 0)
3446                                 goto finish;
3447
3448                         /* Reset signal to default */
3449                         r = default_signals(SIGCHLD, -1);
3450                         if (r < 0)
3451                                 goto finish;
3452
3453                         /* Notify the child that the parent is ready with all
3454                          * its setup, and that the child can now hand over
3455                          * control to the code to run inside the container. */
3456                         barrier_place(&barrier);
3457
3458                         k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3459                         if (k < 0) {
3460                                 r = EXIT_FAILURE;
3461                                 break;
3462                         }
3463
3464                         if (!arg_quiet)
3465                                 putc('\n', stdout);
3466
3467                         /* Kill if it is not dead yet anyway */
3468                         terminate_machine(pid);
3469                 }
3470
3471                 /* Normally redundant, but better safe than sorry */
3472                 kill(pid, SIGKILL);
3473
3474                 r = wait_for_container(pid, &container_status);
3475                 pid = 0;
3476
3477                 if (r < 0) {
3478                         /* We failed to wait for the container, or the
3479                          * container exited abnormally */
3480                         r = EXIT_FAILURE;
3481                         break;
3482                 } else if (r > 0 || container_status == CONTAINER_TERMINATED)
3483                         /* The container exited with a non-zero
3484                          * status, or with zero status and no reboot
3485                          * was requested. */
3486                         break;
3487
3488                 /* CONTAINER_REBOOTED, loop again */
3489
3490                 if (arg_keep_unit) {
3491                         /* Special handling if we are running as a
3492                          * service: instead of simply restarting the
3493                          * machine we want to restart the entire
3494                          * service, so let's inform systemd about this
3495                          * with the special exit code 133. The service
3496                          * file uses RestartForceExitStatus=133 so
3497                          * that this results in a full nspawn
3498                          * restart. This is necessary since we might
3499                          * have cgroup parameters set we want to have
3500                          * flushed out. */
3501                         r = 133;
3502                         break;
3503                 }
3504         }
3505
3506 finish:
3507         sd_notify(false,
3508                   "STOPPING=1\n"
3509                   "STATUS=Terminating...");
3510
3511         loop_remove(loop_nr, &image_fd);
3512
3513         if (pid > 0)
3514                 kill(pid, SIGKILL);
3515
3516         free(arg_directory);
3517         free(arg_machine);
3518         free(arg_user);
3519         strv_free(arg_setenv);
3520         strv_free(arg_network_interfaces);
3521         strv_free(arg_network_macvlan);
3522         strv_free(arg_bind);
3523         strv_free(arg_bind_ro);
3524         strv_free(arg_tmpfs);
3525
3526         return r;
3527 }