chiark / gitweb /
util: introduce our own gperf based capability list
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92 #include "event-util.h"
93 #include "cap-list.h"
94
95 #ifdef HAVE_SECCOMP
96 #include "seccomp-util.h"
97 #endif
98
/* Outcome of a container run: it either terminated or rebooted.
 * NOTE(review): the code that produces and consumes these values is not in
 * the visible part of the file; the meaning is inferred from the names only
 * -- confirm against the users of this enum. */
99 typedef enum ContainerStatus {
100         CONTAINER_TERMINATED,
101         CONTAINER_REBOOTED
102 } ContainerStatus;
103
/* Journal linking mode, selected with --link-journal= (or -j) in parse_argv().
 * The "try-guest"/"try-host" spellings map to LINK_GUEST/LINK_HOST and
 * additionally set arg_link_journal_try; they have no enum value of their
 * own. */
104 typedef enum LinkJournal {
105         LINK_NO,
106         LINK_AUTO,
107         LINK_HOST,
108         LINK_GUEST
109 } LinkJournal;
110
/* Volatile mode, selected with --volatile[=MODE] in parse_argv():
 * a bare --volatile or a true boolean yields VOLATILE_YES, a false boolean
 * yields VOLATILE_NO, and the literal "state" yields VOLATILE_STATE. */
111 typedef enum Volatile {
112         VOLATILE_NO,
113         VOLATILE_YES,
114         VOLATILE_STATE,
115 } Volatile;
116
/* Command line state, filled in by parse_argv(). Variables declared
 * "char *" are heap copies that parse_argv() frees before replacing;
 * variables declared "const char *" point straight at optarg and are not
 * owned. */
117 static char *arg_directory = NULL;
118 static char *arg_user = NULL;
119 static sd_id128_t arg_uuid = {};
120 static char *arg_machine = NULL;
121 static const char *arg_selinux_context = NULL;
122 static const char *arg_selinux_apifs_context = NULL;
123 static const char *arg_slice = NULL;
124 static bool arg_private_network = false;
125 static bool arg_read_only = false;
126 static bool arg_boot = false;
127 static LinkJournal arg_link_journal = LINK_AUTO;
128 static bool arg_link_journal_try = false;
/* Bit mask of capabilities to retain, one bit per CAP_* number. This is the
 * default set; parse_argv() ORs in the --capability= bits (and CAP_NET_ADMIN
 * when a private network is requested) and then masks out the
 * --drop-capability= bits. */
129 static uint64_t arg_retain =
130         (1ULL << CAP_CHOWN) |
131         (1ULL << CAP_DAC_OVERRIDE) |
132         (1ULL << CAP_DAC_READ_SEARCH) |
133         (1ULL << CAP_FOWNER) |
134         (1ULL << CAP_FSETID) |
135         (1ULL << CAP_IPC_OWNER) |
136         (1ULL << CAP_KILL) |
137         (1ULL << CAP_LEASE) |
138         (1ULL << CAP_LINUX_IMMUTABLE) |
139         (1ULL << CAP_NET_BIND_SERVICE) |
140         (1ULL << CAP_NET_BROADCAST) |
141         (1ULL << CAP_NET_RAW) |
142         (1ULL << CAP_SETGID) |
143         (1ULL << CAP_SETFCAP) |
144         (1ULL << CAP_SETPCAP) |
145         (1ULL << CAP_SETUID) |
146         (1ULL << CAP_SYS_ADMIN) |
147         (1ULL << CAP_SYS_CHROOT) |
148         (1ULL << CAP_SYS_NICE) |
149         (1ULL << CAP_SYS_PTRACE) |
150         (1ULL << CAP_SYS_TTY_CONFIG) |
151         (1ULL << CAP_SYS_RESOURCE) |
152         (1ULL << CAP_SYS_BOOT) |
153         (1ULL << CAP_AUDIT_WRITE) |
154         (1ULL << CAP_AUDIT_CONTROL) |
155         (1ULL << CAP_MKNOD);
/* The bind and tmpfs lists are flat string vectors holding consecutive
 * pairs: (source, destination) for the bind lists, (path, mount options)
 * for arg_tmpfs. */
156 static char **arg_bind = NULL;
157 static char **arg_bind_ro = NULL;
158 static char **arg_tmpfs = NULL;
159 static char **arg_setenv = NULL;
160 static bool arg_quiet = false;
161 static bool arg_share_system = false;
162 static bool arg_register = true;
163 static bool arg_keep_unit = false;
164 static char **arg_network_interfaces = NULL;
165 static char **arg_network_macvlan = NULL;
166 static bool arg_network_veth = false;
167 static const char *arg_network_bridge = NULL;
/* 0xffffffffLU is the value parse_argv() treats as a failed
 * personality_from_string() lookup; as the initial value it presumably means
 * "no personality requested" -- confirm against the code that applies it. */
168 static unsigned long arg_personality = 0xffffffffLU;
169 static const char *arg_image = NULL;
170 static Volatile arg_volatile = VOLATILE_NO;
171
172 static void help(void) {
173         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
174                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
175                "  -h --help                 Show this help\n"
176                "     --version              Print version string\n"
177                "  -q --quiet                Do not show status information\n"
178                "  -D --directory=PATH       Root directory for the container\n"
179                "  -i --image=PATH           File system device or image for the container\n"
180                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
181                "  -u --user=USER            Run the command under specified user or uid\n"
182                "  -M --machine=NAME         Set the machine name for the container\n"
183                "     --uuid=UUID            Set a specific machine UUID for the container\n"
184                "  -S --slice=SLICE          Place the container in the specified slice\n"
185                "     --private-network      Disable network in container\n"
186                "     --network-interface=INTERFACE\n"
187                "                            Assign an existing network interface to the\n"
188                "                            container\n"
189                "     --network-macvlan=INTERFACE\n"
190                "                            Create a macvlan network interface based on an\n"
191                "                            existing network interface to the container\n"
192                "     --network-veth         Add a virtual ethernet connection between host\n"
193                "                            and container\n"
194                "     --network-bridge=INTERFACE\n"
195                "                            Add a virtual ethernet connection between host\n"
196                "                            and container and add it to an existing bridge on\n"
197                "                            the host\n"
198                "  -Z --selinux-context=SECLABEL\n"
199                "                            Set the SELinux security context to be used by\n"
200                "                            processes in the container\n"
201                "  -L --selinux-apifs-context=SECLABEL\n"
202                "                            Set the SELinux security context to be used by\n"
203                "                            API/tmpfs file systems in the container\n"
204                "     --capability=CAP       In addition to the default, retain specified\n"
205                "                            capability\n"
206                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
207                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
208                "                            try-guest, try-host\n"
209                "  -j                        Equivalent to --link-journal=try-guest\n"
210                "     --read-only            Mount the root directory read-only\n"
211                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
212                "                            the container\n"
213                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
214                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
215                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
216                "     --share-system         Share system namespaces with host\n"
217                "     --register=BOOLEAN     Register container as machine\n"
218                "     --keep-unit            Do not register a scope for the machine, reuse\n"
219                "                            the service unit nspawn is running in\n"
220                "     --volatile[=MODE]      Run the system in volatile mode\n",
221                program_invocation_short_name);
222 }
223
224 static int parse_argv(int argc, char *argv[]) {
225
226         enum {
227                 ARG_VERSION = 0x100,
228                 ARG_PRIVATE_NETWORK,
229                 ARG_UUID,
230                 ARG_READ_ONLY,
231                 ARG_CAPABILITY,
232                 ARG_DROP_CAPABILITY,
233                 ARG_LINK_JOURNAL,
234                 ARG_BIND,
235                 ARG_BIND_RO,
236                 ARG_TMPFS,
237                 ARG_SETENV,
238                 ARG_SHARE_SYSTEM,
239                 ARG_REGISTER,
240                 ARG_KEEP_UNIT,
241                 ARG_NETWORK_INTERFACE,
242                 ARG_NETWORK_MACVLAN,
243                 ARG_NETWORK_VETH,
244                 ARG_NETWORK_BRIDGE,
245                 ARG_PERSONALITY,
246                 ARG_VOLATILE,
247         };
248
249         static const struct option options[] = {
250                 { "help",                  no_argument,       NULL, 'h'                   },
251                 { "version",               no_argument,       NULL, ARG_VERSION           },
252                 { "directory",             required_argument, NULL, 'D'                   },
253                 { "user",                  required_argument, NULL, 'u'                   },
254                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
255                 { "boot",                  no_argument,       NULL, 'b'                   },
256                 { "uuid",                  required_argument, NULL, ARG_UUID              },
257                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
258                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
259                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
260                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
261                 { "bind",                  required_argument, NULL, ARG_BIND              },
262                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
263                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
264                 { "machine",               required_argument, NULL, 'M'                   },
265                 { "slice",                 required_argument, NULL, 'S'                   },
266                 { "setenv",                required_argument, NULL, ARG_SETENV            },
267                 { "selinux-context",       required_argument, NULL, 'Z'                   },
268                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
269                 { "quiet",                 no_argument,       NULL, 'q'                   },
270                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
271                 { "register",              required_argument, NULL, ARG_REGISTER          },
272                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
273                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
274                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
275                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
276                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
277                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
278                 { "image",                 required_argument, NULL, 'i'                   },
279                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
280                 {}
281         };
282
283         int c, r;
284         uint64_t plus = 0, minus = 0;
285
286         assert(argc >= 0);
287         assert(argv);
288
289         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
290
291                 switch (c) {
292
293                 case 'h':
294                         help();
295                         return 0;
296
297                 case ARG_VERSION:
298                         puts(PACKAGE_STRING);
299                         puts(SYSTEMD_FEATURES);
300                         return 0;
301
302                 case 'D':
303                         free(arg_directory);
304                         arg_directory = canonicalize_file_name(optarg);
305                         if (!arg_directory) {
306                                 log_error_errno(errno, "Invalid root directory: %m");
307                                 return -ENOMEM;
308                         }
309
310                         break;
311
312                 case 'i':
313                         arg_image = optarg;
314                         break;
315
316                 case 'u':
317                         free(arg_user);
318                         arg_user = strdup(optarg);
319                         if (!arg_user)
320                                 return log_oom();
321
322                         break;
323
324                 case ARG_NETWORK_BRIDGE:
325                         arg_network_bridge = optarg;
326
327                         /* fall through */
328
329                 case ARG_NETWORK_VETH:
330                         arg_network_veth = true;
331                         arg_private_network = true;
332                         break;
333
334                 case ARG_NETWORK_INTERFACE:
335                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
336                                 return log_oom();
337
338                         arg_private_network = true;
339                         break;
340
341                 case ARG_NETWORK_MACVLAN:
342                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
343                                 return log_oom();
344
345                         /* fall through */
346
347                 case ARG_PRIVATE_NETWORK:
348                         arg_private_network = true;
349                         break;
350
351                 case 'b':
352                         arg_boot = true;
353                         break;
354
355                 case ARG_UUID:
356                         r = sd_id128_from_string(optarg, &arg_uuid);
357                         if (r < 0) {
358                                 log_error("Invalid UUID: %s", optarg);
359                                 return r;
360                         }
361                         break;
362
363                 case 'S':
364                         arg_slice = optarg;
365                         break;
366
367                 case 'M':
368                         if (isempty(optarg)) {
369                                 free(arg_machine);
370                                 arg_machine = NULL;
371                         } else {
372
373                                 if (!hostname_is_valid(optarg)) {
374                                         log_error("Invalid machine name: %s", optarg);
375                                         return -EINVAL;
376                                 }
377
378                                 free(arg_machine);
379                                 arg_machine = strdup(optarg);
380                                 if (!arg_machine)
381                                         return log_oom();
382
383                                 break;
384                         }
385
386                 case 'Z':
387                         arg_selinux_context = optarg;
388                         break;
389
390                 case 'L':
391                         arg_selinux_apifs_context = optarg;
392                         break;
393
394                 case ARG_READ_ONLY:
395                         arg_read_only = true;
396                         break;
397
398                 case ARG_CAPABILITY:
399                 case ARG_DROP_CAPABILITY: {
400                         const char *state, *word;
401                         size_t length;
402
403                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
404                                 _cleanup_free_ char *t;
405
406                                 t = strndup(word, length);
407                                 if (!t)
408                                         return log_oom();
409
410                                 if (streq(t, "all")) {
411                                         if (c == ARG_CAPABILITY)
412                                                 plus = (uint64_t) -1;
413                                         else
414                                                 minus = (uint64_t) -1;
415                                 } else {
416                                         int cap;
417
418                                         cap = capability_from_name(t);
419                                         if (cap < 0) {
420                                                 log_error("Failed to parse capability %s.", t);
421                                                 return -EINVAL;
422                                         }
423
424                                         if (c == ARG_CAPABILITY)
425                                                 plus |= 1ULL << (uint64_t) cap;
426                                         else
427                                                 minus |= 1ULL << (uint64_t) cap;
428                                 }
429                         }
430
431                         break;
432                 }
433
434                 case 'j':
435                         arg_link_journal = LINK_GUEST;
436                         arg_link_journal_try = true;
437                         break;
438
439                 case ARG_LINK_JOURNAL:
440                         if (streq(optarg, "auto"))
441                                 arg_link_journal = LINK_AUTO;
442                         else if (streq(optarg, "no"))
443                                 arg_link_journal = LINK_NO;
444                         else if (streq(optarg, "guest"))
445                                 arg_link_journal = LINK_GUEST;
446                         else if (streq(optarg, "host"))
447                                 arg_link_journal = LINK_HOST;
448                         else if (streq(optarg, "try-guest")) {
449                                 arg_link_journal = LINK_GUEST;
450                                 arg_link_journal_try = true;
451                         } else if (streq(optarg, "try-host")) {
452                                 arg_link_journal = LINK_HOST;
453                                 arg_link_journal_try = true;
454                         } else {
455                                 log_error("Failed to parse link journal mode %s", optarg);
456                                 return -EINVAL;
457                         }
458
459                         break;
460
461                 case ARG_BIND:
462                 case ARG_BIND_RO: {
463                         _cleanup_free_ char *a = NULL, *b = NULL;
464                         char *e;
465                         char ***x;
466
467                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
468
469                         e = strchr(optarg, ':');
470                         if (e) {
471                                 a = strndup(optarg, e - optarg);
472                                 b = strdup(e + 1);
473                         } else {
474                                 a = strdup(optarg);
475                                 b = strdup(optarg);
476                         }
477
478                         if (!a || !b)
479                                 return log_oom();
480
481                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
482                                 log_error("Invalid bind mount specification: %s", optarg);
483                                 return -EINVAL;
484                         }
485
486                         r = strv_extend(x, a);
487                         if (r < 0)
488                                 return log_oom();
489
490                         r = strv_extend(x, b);
491                         if (r < 0)
492                                 return log_oom();
493
494                         break;
495                 }
496
497                 case ARG_TMPFS: {
498                         _cleanup_free_ char *a = NULL, *b = NULL;
499                         char *e;
500
501                         e = strchr(optarg, ':');
502                         if (e) {
503                                 a = strndup(optarg, e - optarg);
504                                 b = strdup(e + 1);
505                         } else {
506                                 a = strdup(optarg);
507                                 b = strdup("mode=0755");
508                         }
509
510                         if (!a || !b)
511                                 return log_oom();
512
513                         if (!path_is_absolute(a)) {
514                                 log_error("Invalid tmpfs specification: %s", optarg);
515                                 return -EINVAL;
516                         }
517
518                         r = strv_push(&arg_tmpfs, a);
519                         if (r < 0)
520                                 return log_oom();
521
522                         a = NULL;
523
524                         r = strv_push(&arg_tmpfs, b);
525                         if (r < 0)
526                                 return log_oom();
527
528                         b = NULL;
529
530                         break;
531                 }
532
533                 case ARG_SETENV: {
534                         char **n;
535
536                         if (!env_assignment_is_valid(optarg)) {
537                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
538                                 return -EINVAL;
539                         }
540
541                         n = strv_env_set(arg_setenv, optarg);
542                         if (!n)
543                                 return log_oom();
544
545                         strv_free(arg_setenv);
546                         arg_setenv = n;
547                         break;
548                 }
549
550                 case 'q':
551                         arg_quiet = true;
552                         break;
553
554                 case ARG_SHARE_SYSTEM:
555                         arg_share_system = true;
556                         break;
557
558                 case ARG_REGISTER:
559                         r = parse_boolean(optarg);
560                         if (r < 0) {
561                                 log_error("Failed to parse --register= argument: %s", optarg);
562                                 return r;
563                         }
564
565                         arg_register = r;
566                         break;
567
568                 case ARG_KEEP_UNIT:
569                         arg_keep_unit = true;
570                         break;
571
572                 case ARG_PERSONALITY:
573
574                         arg_personality = personality_from_string(optarg);
575                         if (arg_personality == 0xffffffffLU) {
576                                 log_error("Unknown or unsupported personality '%s'.", optarg);
577                                 return -EINVAL;
578                         }
579
580                         break;
581
582                 case ARG_VOLATILE:
583
584                         if (!optarg)
585                                 arg_volatile = VOLATILE_YES;
586                         else {
587                                 r = parse_boolean(optarg);
588                                 if (r < 0) {
589                                         if (streq(optarg, "state"))
590                                                 arg_volatile = VOLATILE_STATE;
591                                         else {
592                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
593                                                 return r;
594                                         }
595                                 } else
596                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
597                         }
598
599                         break;
600
601                 case '?':
602                         return -EINVAL;
603
604                 default:
605                         assert_not_reached("Unhandled option");
606                 }
607
608         if (arg_share_system)
609                 arg_register = false;
610
611         if (arg_boot && arg_share_system) {
612                 log_error("--boot and --share-system may not be combined.");
613                 return -EINVAL;
614         }
615
616         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
617                 log_error("--keep-unit may not be used when invoked from a user session.");
618                 return -EINVAL;
619         }
620
621         if (arg_directory && arg_image) {
622                 log_error("--directory= and --image= may not be combined.");
623                 return -EINVAL;
624         }
625
626         if (arg_volatile != VOLATILE_NO && arg_read_only) {
627                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
628                 return -EINVAL;
629         }
630
631         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
632
633         return 1;
634 }
635
636 static int mount_all(const char *dest) {
637
638         typedef struct MountPoint {
639                 const char *what;
640                 const char *where;
641                 const char *type;
642                 const char *options;
643                 unsigned long flags;
644                 bool fatal;
645         } MountPoint;
646
647         static const MountPoint mount_table[] = {
648                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
649                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
650                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
651                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
652                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
653                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
654                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
655                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
656 #ifdef HAVE_SELINUX
657                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
658                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
659 #endif
660         };
661
662         unsigned k;
663         int r = 0;
664
665         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
666                 _cleanup_free_ char *where = NULL;
667 #ifdef HAVE_SELINUX
668                 _cleanup_free_ char *options = NULL;
669 #endif
670                 const char *o;
671                 int t;
672
673                 where = strjoin(dest, "/", mount_table[k].where, NULL);
674                 if (!where)
675                         return log_oom();
676
677                 t = path_is_mount_point(where, true);
678                 if (t < 0) {
679                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
680
681                         if (r == 0)
682                                 r = t;
683
684                         continue;
685                 }
686
687                 /* Skip this entry if it is not a remount. */
688                 if (mount_table[k].what && t > 0)
689                         continue;
690
691                 t = mkdir_p(where, 0755);
692                 if (t < 0) {
693                         if (mount_table[k].fatal) {
694                                log_error_errno(t, "Failed to create directory %s: %m", where);
695
696                                 if (r == 0)
697                                         r = t;
698                         } else
699                                log_warning_errno(t, "Failed to create directory %s: %m", where);
700
701                         continue;
702                 }
703
704 #ifdef HAVE_SELINUX
705                 if (arg_selinux_apifs_context &&
706                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
707                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
708                         if (!options)
709                                 return log_oom();
710
711                         o = options;
712                 } else
713 #endif
714                         o = mount_table[k].options;
715
716
717                 if (mount(mount_table[k].what,
718                           where,
719                           mount_table[k].type,
720                           mount_table[k].flags,
721                           o) < 0) {
722
723                         if (mount_table[k].fatal) {
724                                 log_error_errno(errno, "mount(%s) failed: %m", where);
725
726                                 if (r == 0)
727                                         r = -errno;
728                         } else
729                                 log_warning_errno(errno, "mount(%s) failed: %m", where);
730                 }
731         }
732
733         return r;
734 }
735
736 static int mount_binds(const char *dest, char **l, bool ro) {
737         char **x, **y;
738
739         STRV_FOREACH_PAIR(x, y, l) {
740                 _cleanup_free_ char *where = NULL;
741                 struct stat source_st, dest_st;
742                 int r;
743
744                 if (stat(*x, &source_st) < 0)
745                         return log_error_errno(errno, "Failed to stat %s: %m", *x);
746
747                 where = strappend(dest, *y);
748                 if (!where)
749                         return log_oom();
750
751                 r = stat(where, &dest_st);
752                 if (r == 0) {
753                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
754                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
755                                 return -EINVAL;
756                         }
757                 } else if (errno == ENOENT) {
758                         r = mkdir_parents_label(where, 0755);
759                         if (r < 0)
760                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
761                 } else {
762                         log_error_errno(errno, "Failed to bind mount %s: %m", *x);
763                         return -errno;
764                 }
765
766                 /* Create the mount point, but be conservative -- refuse to create block
767                  * and char devices. */
768                 if (S_ISDIR(source_st.st_mode)) {
769                         r = mkdir_label(where, 0755);
770                         if (r < 0 && errno != EEXIST)
771                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
772                 } else if (S_ISFIFO(source_st.st_mode)) {
773                         r = mkfifo(where, 0644);
774                         if (r < 0 && errno != EEXIST)
775                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
776                 } else if (S_ISSOCK(source_st.st_mode)) {
777                         r = mknod(where, 0644 | S_IFSOCK, 0);
778                         if (r < 0 && errno != EEXIST)
779                                 return log_error_errno(errno, "Failed to create mount point %s: %m", where);
780                 } else if (S_ISREG(source_st.st_mode)) {
781                         r = touch(where);
782                         if (r < 0)
783                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
784                 } else {
785                         log_error("Refusing to create mountpoint for file: %s", *x);
786                         return -ENOTSUP;
787                 }
788
789                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0)
790                         return log_error_errno(errno, "mount(%s) failed: %m", where);
791
792                 if (ro) {
793                         r = bind_remount_recursive(where, true);
794                         if (r < 0)
795                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
796                 }
797         }
798
799         return 0;
800 }
801
802 static int mount_tmpfs(const char *dest) {
803         char **i, **o;
804
805         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
806                 _cleanup_free_ char *where = NULL;
807                 int r;
808
809                 where = strappend(dest, *i);
810                 if (!where)
811                         return log_oom();
812
813                 r = mkdir_label(where, 0755);
814                 if (r < 0 && r != -EEXIST)
815                         return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
816
817                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0)
818                         return log_error_errno(errno, "tmpfs mount to %s failed: %m", where);
819         }
820
821         return 0;
822 }
823
824 static int setup_timezone(const char *dest) {
825         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
826         char *z, *y;
827         int r;
828
829         assert(dest);
830
831         /* Fix the timezone, if possible */
832         r = readlink_malloc("/etc/localtime", &p);
833         if (r < 0) {
834                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
835                 return 0;
836         }
837
838         z = path_startswith(p, "../usr/share/zoneinfo/");
839         if (!z)
840                 z = path_startswith(p, "/usr/share/zoneinfo/");
841         if (!z) {
842                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
843                 return 0;
844         }
845
846         where = strappend(dest, "/etc/localtime");
847         if (!where)
848                 return log_oom();
849
850         r = readlink_malloc(where, &q);
851         if (r >= 0) {
852                 y = path_startswith(q, "../usr/share/zoneinfo/");
853                 if (!y)
854                         y = path_startswith(q, "/usr/share/zoneinfo/");
855
856                 /* Already pointing to the right place? Then do nothing .. */
857                 if (y && streq(y, z))
858                         return 0;
859         }
860
861         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
862         if (!check)
863                 return log_oom();
864
865         if (access(check, F_OK) < 0) {
866                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
867                 return 0;
868         }
869
870         what = strappend("../usr/share/zoneinfo/", z);
871         if (!what)
872                 return log_oom();
873
874         r = mkdir_parents(where, 0755);
875         if (r < 0) {
876                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
877
878                 return 0;
879         }
880
881         r = unlink(where);
882         if (r < 0 && errno != ENOENT) {
883                 log_error_errno(errno, "Failed to remove existing timezone info %s in container: %m", where);
884
885                 return 0;
886         }
887
888         if (symlink(what, where) < 0) {
889                 log_error_errno(errno, "Failed to correct timezone of container: %m");
890                 return 0;
891         }
892
893         return 0;
894 }
895
896 static int setup_resolv_conf(const char *dest) {
897         _cleanup_free_ char *where = NULL;
898         int r;
899
900         assert(dest);
901
902         if (arg_private_network)
903                 return 0;
904
905         /* Fix resolv.conf, if possible */
906         where = strappend(dest, "/etc/resolv.conf");
907         if (!where)
908                 return log_oom();
909
910         /* We don't really care for the results of this really. If it
911          * fails, it fails, but meh... */
912         r = mkdir_parents(where, 0755);
913         if (r < 0) {
914                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
915
916                 return 0;
917         }
918
919         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
920         if (r < 0) {
921                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
922
923                 return 0;
924         }
925
926         return 0;
927 }
928
929 static int setup_volatile_state(const char *directory) {
930         const char *p;
931         int r;
932
933         assert(directory);
934
935         if (arg_volatile != VOLATILE_STATE)
936                 return 0;
937
938         /* --volatile=state means we simply overmount /var
939            with a tmpfs, and the rest read-only. */
940
941         r = bind_remount_recursive(directory, true);
942         if (r < 0)
943                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
944
945         p = strappenda(directory, "/var");
946         r = mkdir(p, 0755);
947         if (r < 0 && errno != EEXIST)
948                 return log_error_errno(errno, "Failed to create %s: %m", directory);
949
950         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0)
951                 return log_error_errno(errno, "Failed to mount tmpfs to /var: %m");
952
953         return 0;
954 }
955
956 static int setup_volatile(const char *directory) {
957         bool tmpfs_mounted = false, bind_mounted = false;
958         char template[] = "/tmp/nspawn-volatile-XXXXXX";
959         const char *f, *t;
960         int r;
961
962         assert(directory);
963
964         if (arg_volatile != VOLATILE_YES)
965                 return 0;
966
967         /* --volatile=yes means we mount a tmpfs to the root dir, and
968            the original /usr to use inside it, and that read-only. */
969
970         if (!mkdtemp(template))
971                 return log_error_errno(errno, "Failed to create temporary directory: %m");
972
973         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
974                 log_error_errno(errno, "Failed to mount tmpfs for root directory: %m");
975                 r = -errno;
976                 goto fail;
977         }
978
979         tmpfs_mounted = true;
980
981         f = strappenda(directory, "/usr");
982         t = strappenda(template, "/usr");
983
984         r = mkdir(t, 0755);
985         if (r < 0 && errno != EEXIST) {
986                 log_error_errno(errno, "Failed to create %s: %m", t);
987                 r = -errno;
988                 goto fail;
989         }
990
991         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
992                 log_error_errno(errno, "Failed to create /usr bind mount: %m");
993                 r = -errno;
994                 goto fail;
995         }
996
997         bind_mounted = true;
998
999         r = bind_remount_recursive(t, true);
1000         if (r < 0) {
1001                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1002                 goto fail;
1003         }
1004
1005         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1006                 log_error_errno(errno, "Failed to move root mount: %m");
1007                 r = -errno;
1008                 goto fail;
1009         }
1010
1011         rmdir(template);
1012
1013         return 0;
1014
1015 fail:
1016         if (bind_mounted)
1017                 umount(t);
1018         if (tmpfs_mounted)
1019                 umount(template);
1020         rmdir(template);
1021         return r;
1022 }
1023
1024 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1025
1026         snprintf(s, 37,
1027                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1028                  SD_ID128_FORMAT_VAL(id));
1029
1030         return s;
1031 }
1032
1033 static int setup_boot_id(const char *dest) {
1034         _cleanup_free_ char *from = NULL, *to = NULL;
1035         sd_id128_t rnd = {};
1036         char as_uuid[37];
1037         int r;
1038
1039         assert(dest);
1040
1041         if (arg_share_system)
1042                 return 0;
1043
1044         /* Generate a new randomized boot ID, so that each boot-up of
1045          * the container gets a new one */
1046
1047         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1048         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1049         if (!from || !to)
1050                 return log_oom();
1051
1052         r = sd_id128_randomize(&rnd);
1053         if (r < 0)
1054                 return log_error_errno(r, "Failed to generate random boot id: %m");
1055
1056         id128_format_as_uuid(rnd, as_uuid);
1057
1058         r = write_string_file(from, as_uuid);
1059         if (r < 0)
1060                 return log_error_errno(r, "Failed to write boot id: %m");
1061
1062         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1063                 log_error_errno(errno, "Failed to bind mount boot id: %m");
1064                 r = -errno;
1065         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1066                 log_warning_errno(errno, "Failed to make boot id read-only: %m");
1067
1068         unlink(from);
1069         return r;
1070 }
1071
1072 static int copy_devnodes(const char *dest) {
1073
1074         static const char devnodes[] =
1075                 "null\0"
1076                 "zero\0"
1077                 "full\0"
1078                 "random\0"
1079                 "urandom\0"
1080                 "tty\0"
1081                 "net/tun\0";
1082
1083         const char *d;
1084         int r = 0;
1085         _cleanup_umask_ mode_t u;
1086
1087         assert(dest);
1088
1089         u = umask(0000);
1090
1091         NULSTR_FOREACH(d, devnodes) {
1092                 _cleanup_free_ char *from = NULL, *to = NULL;
1093                 struct stat st;
1094
1095                 from = strappend("/dev/", d);
1096                 to = strjoin(dest, "/dev/", d, NULL);
1097                 if (!from || !to)
1098                         return log_oom();
1099
1100                 if (stat(from, &st) < 0) {
1101
1102                         if (errno != ENOENT)
1103                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
1104
1105                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1106
1107                         log_error("%s is not a char or block device, cannot copy", from);
1108                         return -EIO;
1109
1110                 } else {
1111                         r = mkdir_parents(to, 0775);
1112                         if (r < 0) {
1113                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1114                                 return -r;
1115                         }
1116
1117                         if (mknod(to, st.st_mode, st.st_rdev) < 0)
1118                                 return log_error_errno(errno, "mknod(%s) failed: %m", dest);
1119                 }
1120         }
1121
1122         return r;
1123 }
1124
1125 static int setup_ptmx(const char *dest) {
1126         _cleanup_free_ char *p = NULL;
1127
1128         p = strappend(dest, "/dev/ptmx");
1129         if (!p)
1130                 return log_oom();
1131
1132         if (symlink("pts/ptmx", p) < 0)
1133                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
1134
1135         return 0;
1136 }
1137
1138 static int setup_dev_console(const char *dest, const char *console) {
1139         _cleanup_umask_ mode_t u;
1140         const char *to;
1141         struct stat st;
1142         int r;
1143
1144         assert(dest);
1145         assert(console);
1146
1147         u = umask(0000);
1148
1149         if (stat("/dev/null", &st) < 0)
1150                 return log_error_errno(errno, "Failed to stat /dev/null: %m");
1151
1152         r = chmod_and_chown(console, 0600, 0, 0);
1153         if (r < 0)
1154                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1155
1156         /* We need to bind mount the right tty to /dev/console since
1157          * ptys can only exist on pts file systems. To have something
1158          * to bind mount things on we create a device node first, and
1159          * use /dev/null for that since we the cgroups device policy
1160          * allows us to create that freely, while we cannot create
1161          * /dev/console. (Note that the major minor doesn't actually
1162          * matter here, since we mount it over anyway). */
1163
1164         to = strappenda(dest, "/dev/console");
1165         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
1166                 return log_error_errno(errno, "mknod() for /dev/console failed: %m");
1167
1168         if (mount(console, to, "bind", MS_BIND, NULL) < 0)
1169                 return log_error_errno(errno, "Bind mount for /dev/console failed: %m");
1170
1171         return 0;
1172 }
1173
1174 static int setup_kmsg(const char *dest, int kmsg_socket) {
1175         _cleanup_free_ char *from = NULL, *to = NULL;
1176         int r, fd, k;
1177         _cleanup_umask_ mode_t u;
1178         union {
1179                 struct cmsghdr cmsghdr;
1180                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1181         } control = {};
1182         struct msghdr mh = {
1183                 .msg_control = &control,
1184                 .msg_controllen = sizeof(control),
1185         };
1186         struct cmsghdr *cmsg;
1187
1188         assert(dest);
1189         assert(kmsg_socket >= 0);
1190
1191         u = umask(0000);
1192
1193         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1194          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1195          * on the reading side behave very similar to /proc/kmsg,
1196          * their writing side behaves differently from /dev/kmsg in
1197          * that writing blocks when nothing is reading. In order to
1198          * avoid any problems with containers deadlocking due to this
1199          * we simply make /dev/kmsg unavailable to the container. */
1200         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1201             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1202                 return log_oom();
1203
1204         if (mkfifo(from, 0600) < 0)
1205                 return log_error_errno(errno, "mkfifo() for /dev/kmsg failed: %m");
1206
1207         r = chmod_and_chown(from, 0600, 0, 0);
1208         if (r < 0)
1209                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1210
1211         if (mount(from, to, "bind", MS_BIND, NULL) < 0)
1212                 return log_error_errno(errno, "Bind mount for /proc/kmsg failed: %m");
1213
1214         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1215         if (fd < 0)
1216                 return log_error_errno(errno, "Failed to open fifo: %m");
1217
1218         cmsg = CMSG_FIRSTHDR(&mh);
1219         cmsg->cmsg_level = SOL_SOCKET;
1220         cmsg->cmsg_type = SCM_RIGHTS;
1221         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1222         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1223
1224         mh.msg_controllen = cmsg->cmsg_len;
1225
1226         /* Store away the fd in the socket, so that it stays open as
1227          * long as we run the child */
1228         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1229         safe_close(fd);
1230
1231         if (k < 0)
1232                 return log_error_errno(errno, "Failed to send FIFO fd: %m");
1233
1234         /* And now make the FIFO unavailable as /dev/kmsg... */
1235         unlink(from);
1236         return 0;
1237 }
1238
1239 static int setup_hostname(void) {
1240
1241         if (arg_share_system)
1242                 return 0;
1243
1244         if (sethostname_idempotent(arg_machine) < 0)
1245                 return -errno;
1246
1247         return 0;
1248 }
1249
1250 static int setup_journal(const char *directory) {
1251         sd_id128_t machine_id, this_id;
1252         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1253         char *id;
1254         int r;
1255
1256         p = strappend(directory, "/etc/machine-id");
1257         if (!p)
1258                 return log_oom();
1259
1260         r = read_one_line_file(p, &b);
1261         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1262                 return 0;
1263         else if (r < 0)
1264                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1265
1266         id = strstrip(b);
1267         if (isempty(id) && arg_link_journal == LINK_AUTO)
1268                 return 0;
1269
1270         /* Verify validity */
1271         r = sd_id128_from_string(id, &machine_id);
1272         if (r < 0)
1273                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1274
1275         r = sd_id128_get_machine(&this_id);
1276         if (r < 0)
1277                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1278
1279         if (sd_id128_equal(machine_id, this_id)) {
1280                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1281                          "Host and machine ids are equal (%s): refusing to link journals", id);
1282                 if (arg_link_journal == LINK_AUTO)
1283                         return 0;
1284                 return
1285                         -EEXIST;
1286         }
1287
1288         if (arg_link_journal == LINK_NO)
1289                 return 0;
1290
1291         free(p);
1292         p = strappend("/var/log/journal/", id);
1293         q = strjoin(directory, "/var/log/journal/", id, NULL);
1294         if (!p || !q)
1295                 return log_oom();
1296
1297         if (path_is_mount_point(p, false) > 0) {
1298                 if (arg_link_journal != LINK_AUTO) {
1299                         log_error("%s: already a mount point, refusing to use for journal", p);
1300                         return -EEXIST;
1301                 }
1302
1303                 return 0;
1304         }
1305
1306         if (path_is_mount_point(q, false) > 0) {
1307                 if (arg_link_journal != LINK_AUTO) {
1308                         log_error("%s: already a mount point, refusing to use for journal", q);
1309                         return -EEXIST;
1310                 }
1311
1312                 return 0;
1313         }
1314
1315         r = readlink_and_make_absolute(p, &d);
1316         if (r >= 0) {
1317                 if ((arg_link_journal == LINK_GUEST ||
1318                      arg_link_journal == LINK_AUTO) &&
1319                     path_equal(d, q)) {
1320
1321                         r = mkdir_p(q, 0755);
1322                         if (r < 0)
1323                                 log_warning_errno(errno, "Failed to create directory %s: %m", q);
1324                         return 0;
1325                 }
1326
1327                 if (unlink(p) < 0)
1328                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
1329         } else if (r == -EINVAL) {
1330
1331                 if (arg_link_journal == LINK_GUEST &&
1332                     rmdir(p) < 0) {
1333
1334                         if (errno == ENOTDIR) {
1335                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1336                                 return r;
1337                         } else {
1338                                 log_error_errno(errno, "Failed to remove %s: %m", p);
1339                                 return -errno;
1340                         }
1341                 }
1342         } else if (r != -ENOENT) {
1343                 log_error_errno(errno, "readlink(%s) failed: %m", p);
1344                 return r;
1345         }
1346
1347         if (arg_link_journal == LINK_GUEST) {
1348
1349                 if (symlink(q, p) < 0) {
1350                         if (arg_link_journal_try) {
1351                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1352                                 return 0;
1353                         } else {
1354                                 log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
1355                                 return -errno;
1356                         }
1357                 }
1358
1359                 r = mkdir_p(q, 0755);
1360                 if (r < 0)
1361                         log_warning_errno(errno, "Failed to create directory %s: %m", q);
1362                 return 0;
1363         }
1364
1365         if (arg_link_journal == LINK_HOST) {
1366                 /* don't create parents here -- if the host doesn't have
1367                  * permanent journal set up, don't force it here */
1368                 r = mkdir(p, 0755);
1369                 if (r < 0) {
1370                         if (arg_link_journal_try) {
1371                                 log_debug_errno(errno, "Failed to create %s, skipping journal setup: %m", p);
1372                                 return 0;
1373                         } else {
1374                                 log_error_errno(errno, "Failed to create %s: %m", p);
1375                                 return r;
1376                         }
1377                 }
1378
1379         } else if (access(p, F_OK) < 0)
1380                 return 0;
1381
1382         if (dir_is_empty(q) == 0)
1383                 log_warning("%s is not empty, proceeding anyway.", q);
1384
1385         r = mkdir_p(q, 0755);
1386         if (r < 0) {
1387                 log_error_errno(errno, "Failed to create %s: %m", q);
1388                 return r;
1389         }
1390
1391         if (mount(p, q, "bind", MS_BIND, NULL) < 0)
1392                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
1393
1394         return 0;
1395 }
1396
1397 static int drop_capabilities(void) {
1398         return capability_bounding_set_drop(~arg_retain, false);
1399 }
1400
1401 static int register_machine(pid_t pid, int local_ifindex) {
1402         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1403         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1404         int r;
1405
1406         if (!arg_register)
1407                 return 0;
1408
1409         r = sd_bus_default_system(&bus);
1410         if (r < 0)
1411                 return log_error_errno(r, "Failed to open system bus: %m");
1412
1413         if (arg_keep_unit) {
1414                 r = sd_bus_call_method(
1415                                 bus,
1416                                 "org.freedesktop.machine1",
1417                                 "/org/freedesktop/machine1",
1418                                 "org.freedesktop.machine1.Manager",
1419                                 "RegisterMachineWithNetwork",
1420                                 &error,
1421                                 NULL,
1422                                 "sayssusai",
1423                                 arg_machine,
1424                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1425                                 "nspawn",
1426                                 "container",
1427                                 (uint32_t) pid,
1428                                 strempty(arg_directory),
1429                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1430         } else {
1431                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1432
1433                 r = sd_bus_message_new_method_call(
1434                                 bus,
1435                                 &m,
1436                                 "org.freedesktop.machine1",
1437                                 "/org/freedesktop/machine1",
1438                                 "org.freedesktop.machine1.Manager",
1439                                 "CreateMachineWithNetwork");
1440                 if (r < 0)
1441                         return log_error_errno(r, "Failed to create message: %m");
1442
1443                 r = sd_bus_message_append(
1444                                 m,
1445                                 "sayssusai",
1446                                 arg_machine,
1447                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1448                                 "nspawn",
1449                                 "container",
1450                                 (uint32_t) pid,
1451                                 strempty(arg_directory),
1452                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1453                 if (r < 0)
1454                         return log_error_errno(r, "Failed to append message arguments: %m");
1455
1456                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1457                 if (r < 0)
1458                         return log_error_errno(r, "Failed to open container: %m");
1459
1460                 if (!isempty(arg_slice)) {
1461                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1462                         if (r < 0)
1463                                 return log_error_errno(r, "Failed to append slice: %m");
1464                 }
1465
1466                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1467                 if (r < 0)
1468                         return log_error_errno(r, "Failed to add device policy: %m");
1469
1470                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1471                                           /* Allow the container to
1472                                            * access and create the API
1473                                            * device nodes, so that
1474                                            * PrivateDevices= in the
1475                                            * container can work
1476                                            * fine */
1477                                           "/dev/null", "rwm",
1478                                           "/dev/zero", "rwm",
1479                                           "/dev/full", "rwm",
1480                                           "/dev/random", "rwm",
1481                                           "/dev/urandom", "rwm",
1482                                           "/dev/tty", "rwm",
1483                                           "/dev/net/tun", "rwm",
1484                                           /* Allow the container
1485                                            * access to ptys. However,
1486                                            * do not permit the
1487                                            * container to ever create
1488                                            * these device nodes. */
1489                                           "/dev/pts/ptmx", "rw",
1490                                           "char-pts", "rw");
1491                 if (r < 0)
1492                         return log_error_errno(r, "Failed to add device whitelist: %m");
1493
1494                 r = sd_bus_message_close_container(m);
1495                 if (r < 0)
1496                         return log_error_errno(r, "Failed to close container: %m");
1497
1498                 r = sd_bus_call(bus, m, 0, &error, NULL);
1499         }
1500
1501         if (r < 0) {
1502                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1503                 return r;
1504         }
1505
1506         return 0;
1507 }
1508
1509 static int terminate_machine(pid_t pid) {
1510         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1511         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1512         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1513         const char *path;
1514         int r;
1515
1516         if (!arg_register)
1517                 return 0;
1518
1519         r = sd_bus_default_system(&bus);
1520         if (r < 0)
1521                 return log_error_errno(r, "Failed to open system bus: %m");
1522
1523         r = sd_bus_call_method(
1524                         bus,
1525                         "org.freedesktop.machine1",
1526                         "/org/freedesktop/machine1",
1527                         "org.freedesktop.machine1.Manager",
1528                         "GetMachineByPID",
1529                         &error,
1530                         &reply,
1531                         "u",
1532                         (uint32_t) pid);
1533         if (r < 0) {
1534                 /* Note that the machine might already have been
1535                  * cleaned up automatically, hence don't consider it a
1536                  * failure if we cannot get the machine object. */
1537                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1538                 return 0;
1539         }
1540
1541         r = sd_bus_message_read(reply, "o", &path);
1542         if (r < 0)
1543                 return bus_log_parse_error(r);
1544
1545         r = sd_bus_call_method(
1546                         bus,
1547                         "org.freedesktop.machine1",
1548                         path,
1549                         "org.freedesktop.machine1.Machine",
1550                         "Terminate",
1551                         &error,
1552                         NULL,
1553                         NULL);
1554         if (r < 0) {
1555                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1556                 return 0;
1557         }
1558
1559         return 0;
1560 }
1561
1562 static int reset_audit_loginuid(void) {
1563         _cleanup_free_ char *p = NULL;
1564         int r;
1565
1566         if (arg_share_system)
1567                 return 0;
1568
1569         r = read_one_line_file("/proc/self/loginuid", &p);
1570         if (r == -ENOENT)
1571                 return 0;
1572         if (r < 0)
1573                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1574
1575         /* Already reset? */
1576         if (streq(p, "4294967295"))
1577                 return 0;
1578
1579         r = write_string_file("/proc/self/loginuid", "4294967295");
1580         if (r < 0) {
1581                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1582                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1583                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1584                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1585                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1586
1587                 sleep(5);
1588         }
1589
1590         return 0;
1591 }
1592
1593 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1594 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1595 #define MACVLAN_HASH_KEY SD_ID128_MAKE(00,13,6d,bc,66,83,44,81,bb,0c,f9,51,1f,24,a6,6f)
1596
1597 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key, uint64_t idx) {
1598         uint8_t result[8];
1599         size_t l, sz;
1600         uint8_t *v, *i;
1601         int r;
1602
1603         l = strlen(arg_machine);
1604         sz = sizeof(sd_id128_t) + l;
1605         if (idx > 0)
1606                 sz += sizeof(idx);
1607
1608         v = alloca(sz);
1609
1610         /* fetch some persistent data unique to the host */
1611         r = sd_id128_get_machine((sd_id128_t*) v);
1612         if (r < 0)
1613                 return r;
1614
1615         /* combine with some data unique (on this host) to this
1616          * container instance */
1617         i = mempcpy(v + sizeof(sd_id128_t), arg_machine, l);
1618         if (idx > 0) {
1619                 idx = htole64(idx);
1620                 memcpy(i, &idx, sizeof(idx));
1621         }
1622
1623         /* Let's hash the host machine ID plus the container name. We
1624          * use a fixed, but originally randomly created hash key here. */
1625         siphash24(result, v, sz, hash_key.bytes);
1626
1627         assert_cc(ETH_ALEN <= sizeof(result));
1628         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1629
1630         /* see eth_random_addr in the kernel */
1631         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1632         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1633
1634         return 0;
1635 }
1636
1637 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1638         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1639         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1640         struct ether_addr mac_host, mac_container;
1641         int r, i;
1642
1643         if (!arg_private_network)
1644                 return 0;
1645
1646         if (!arg_network_veth)
1647                 return 0;
1648
1649         /* Use two different interface name prefixes depending whether
1650          * we are in bridge mode or not. */
1651         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1652                  arg_network_bridge ? "vb" : "ve", arg_machine);
1653
1654         r = generate_mac(&mac_container, CONTAINER_HASH_KEY, 0);
1655         if (r < 0)
1656                 return log_error_errno(r, "Failed to generate predictable MAC address for container side: %m");
1657
1658         r = generate_mac(&mac_host, HOST_HASH_KEY, 0);
1659         if (r < 0)
1660                 return log_error_errno(r, "Failed to generate predictable MAC address for host side: %m");
1661
1662         r = sd_rtnl_open(&rtnl, 0);
1663         if (r < 0)
1664                 return log_error_errno(r, "Failed to connect to netlink: %m");
1665
1666         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1667         if (r < 0)
1668                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1669
1670         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1671         if (r < 0)
1672                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1673
1674         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1675         if (r < 0)
1676                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1677
1678         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1679         if (r < 0)
1680                 return log_error_errno(r, "Failed to open netlink container: %m");
1681
1682         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1683         if (r < 0)
1684                 return log_error_errno(r, "Failed to open netlink container: %m");
1685
1686         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1687         if (r < 0)
1688                 return log_error_errno(r, "Failed to open netlink container: %m");
1689
1690         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1691         if (r < 0)
1692                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1693
1694         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1695         if (r < 0)
1696                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1697
1698         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1699         if (r < 0)
1700                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1701
1702         r = sd_rtnl_message_close_container(m);
1703         if (r < 0)
1704                 return log_error_errno(r, "Failed to close netlink container: %m");
1705
1706         r = sd_rtnl_message_close_container(m);
1707         if (r < 0)
1708                 return log_error_errno(r, "Failed to close netlink container: %m");
1709
1710         r = sd_rtnl_message_close_container(m);
1711         if (r < 0)
1712                 return log_error_errno(r, "Failed to close netlink container: %m");
1713
1714         r = sd_rtnl_call(rtnl, m, 0, NULL);
1715         if (r < 0)
1716                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
1717
1718         i = (int) if_nametoindex(iface_name);
1719         if (i <= 0)
1720                 return log_error_errno(errno, "Failed to resolve interface %s: %m", iface_name);
1721
1722         *ifi = i;
1723
1724         return 0;
1725 }
1726
1727 static int setup_bridge(const char veth_name[], int *ifi) {
1728         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1729         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1730         int r, bridge;
1731
1732         if (!arg_private_network)
1733                 return 0;
1734
1735         if (!arg_network_veth)
1736                 return 0;
1737
1738         if (!arg_network_bridge)
1739                 return 0;
1740
1741         bridge = (int) if_nametoindex(arg_network_bridge);
1742         if (bridge <= 0)
1743                 return log_error_errno(errno, "Failed to resolve interface %s: %m", arg_network_bridge);
1744
1745         *ifi = bridge;
1746
1747         r = sd_rtnl_open(&rtnl, 0);
1748         if (r < 0)
1749                 return log_error_errno(r, "Failed to connect to netlink: %m");
1750
1751         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1752         if (r < 0)
1753                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1754
1755         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1756         if (r < 0)
1757                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
1758
1759         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1760         if (r < 0)
1761                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
1762
1763         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1764         if (r < 0)
1765                 return log_error_errno(r, "Failed to add netlink master field: %m");
1766
1767         r = sd_rtnl_call(rtnl, m, 0, NULL);
1768         if (r < 0)
1769                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
1770
1771         return 0;
1772 }
1773
1774 static int parse_interface(struct udev *udev, const char *name) {
1775         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1776         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1777         int ifi;
1778
1779         ifi = (int) if_nametoindex(name);
1780         if (ifi <= 0)
1781                 return log_error_errno(errno, "Failed to resolve interface %s: %m", name);
1782
1783         sprintf(ifi_str, "n%i", ifi);
1784         d = udev_device_new_from_device_id(udev, ifi_str);
1785         if (!d)
1786                 return log_error_errno(errno, "Failed to get udev device for interface %s: %m", name);
1787
1788         if (udev_device_get_is_initialized(d) <= 0) {
1789                 log_error("Network interface %s is not initialized yet.", name);
1790                 return -EBUSY;
1791         }
1792
1793         return ifi;
1794 }
1795
1796 static int move_network_interfaces(pid_t pid) {
1797         _cleanup_udev_unref_ struct udev *udev = NULL;
1798         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1799         char **i;
1800         int r;
1801
1802         if (!arg_private_network)
1803                 return 0;
1804
1805         if (strv_isempty(arg_network_interfaces))
1806                 return 0;
1807
1808         r = sd_rtnl_open(&rtnl, 0);
1809         if (r < 0)
1810                 return log_error_errno(r, "Failed to connect to netlink: %m");
1811
1812         udev = udev_new();
1813         if (!udev) {
1814                 log_error("Failed to connect to udev.");
1815                 return -ENOMEM;
1816         }
1817
1818         STRV_FOREACH(i, arg_network_interfaces) {
1819                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1820                 int ifi;
1821
1822                 ifi = parse_interface(udev, *i);
1823                 if (ifi < 0)
1824                         return ifi;
1825
1826                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1827                 if (r < 0)
1828                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1829
1830                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1831                 if (r < 0)
1832                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
1833
1834                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1835                 if (r < 0)
1836                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
1837         }
1838
1839         return 0;
1840 }
1841
1842 static int setup_macvlan(pid_t pid) {
1843         _cleanup_udev_unref_ struct udev *udev = NULL;
1844         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1845         unsigned idx = 0;
1846         char **i;
1847         int r;
1848
1849         if (!arg_private_network)
1850                 return 0;
1851
1852         if (strv_isempty(arg_network_macvlan))
1853                 return 0;
1854
1855         r = sd_rtnl_open(&rtnl, 0);
1856         if (r < 0)
1857                 return log_error_errno(r, "Failed to connect to netlink: %m");
1858
1859         udev = udev_new();
1860         if (!udev) {
1861                 log_error("Failed to connect to udev.");
1862                 return -ENOMEM;
1863         }
1864
1865         STRV_FOREACH(i, arg_network_macvlan) {
1866                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1867                 _cleanup_free_ char *n = NULL;
1868                 struct ether_addr mac;
1869                 int ifi;
1870
1871                 ifi = parse_interface(udev, *i);
1872                 if (ifi < 0)
1873                         return ifi;
1874
1875                 r = generate_mac(&mac, MACVLAN_HASH_KEY, idx++);
1876                 if (r < 0)
1877                         return log_error_errno(r, "Failed to create MACVLAN MAC address: %m");
1878
1879                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1880                 if (r < 0)
1881                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1882
1883                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1884                 if (r < 0)
1885                         return log_error_errno(r, "Failed to add netlink interface index: %m");
1886
1887                 n = strappend("mv-", *i);
1888                 if (!n)
1889                         return log_oom();
1890
1891                 strshorten(n, IFNAMSIZ-1);
1892
1893                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1894                 if (r < 0)
1895                         return log_error_errno(r, "Failed to add netlink interface name: %m");
1896
1897                 r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac);
1898                 if (r < 0)
1899                         return log_error_errno(r, "Failed to add netlink MAC address: %m");
1900
1901                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1902                 if (r < 0)
1903                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
1904
1905                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1906                 if (r < 0)
1907                         return log_error_errno(r, "Failed to open netlink container: %m");
1908
1909                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1910                 if (r < 0)
1911                         return log_error_errno(r, "Failed to open netlink container: %m");
1912
1913                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1914                 if (r < 0)
1915                         return log_error_errno(r, "Failed to append macvlan mode: %m");
1916
1917                 r = sd_rtnl_message_close_container(m);
1918                 if (r < 0)
1919                         return log_error_errno(r, "Failed to close netlink container: %m");
1920
1921                 r = sd_rtnl_message_close_container(m);
1922                 if (r < 0)
1923                         return log_error_errno(r, "Failed to close netlink container: %m");
1924
1925                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1926                 if (r < 0)
1927                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
1928         }
1929
1930         return 0;
1931 }
1932
/* Installs the seccomp filter used for the container payload.
 *
 * The filter allows everything by default, makes a fixed list of
 * system calls fail with EPERM, and makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns the result of the last seccomp operation: a negative value
 * if any step failed (after logging it), otherwise the return value of
 * seccomp_load(). Without HAVE_SECCOMP this is a no-op returning 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        /* System calls the container shall not be allowed to use */
        static const int blocked_syscalls[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blocked_syscalls); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blocked_syscalls[n], 0);

                /* -EFAULT means the syscall is unknown; just skip it */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit does not work in containers, and much of the audit
         * userspace hookup fails when run inside of one. We do not
         * care, and simply turn off the creation of audit sockets.
         *
         * With this rule socket(AF_NETLINK, *, NETLINK_AUDIT) fails
         * with EAFNOSUPPORT, which audit userspace takes as indication
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(ctx,
                             SCMP_ACT_ERRNO(EAFNOSUPPORT),
                             SCMP_SYS(socket),
                             2,
                             SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                             SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Loading the filter shall not turn on NO_NEW_PRIVS */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on success and failure alike */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2012
2013 static int setup_image(char **device_path, int *loop_nr) {
2014         struct loop_info64 info = {
2015                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2016         };
2017         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2018         _cleanup_free_ char* loopdev = NULL;
2019         struct stat st;
2020         int r, nr;
2021
2022         assert(device_path);
2023         assert(loop_nr);
2024
2025         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2026         if (fd < 0)
2027                 return log_error_errno(errno, "Failed to open %s: %m", arg_image);
2028
2029         if (fstat(fd, &st) < 0)
2030                 return log_error_errno(errno, "Failed to stat %s: %m", arg_image);
2031
2032         if (S_ISBLK(st.st_mode)) {
2033                 char *p;
2034
2035                 p = strdup(arg_image);
2036                 if (!p)
2037                         return log_oom();
2038
2039                 *device_path = p;
2040
2041                 *loop_nr = -1;
2042
2043                 r = fd;
2044                 fd = -1;
2045
2046                 return r;
2047         }
2048
2049         if (!S_ISREG(st.st_mode)) {
2050                 log_error_errno(errno, "%s is not a regular file or block device: %m", arg_image);
2051                 return -EINVAL;
2052         }
2053
2054         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2055         if (control < 0)
2056                 return log_error_errno(errno, "Failed to open /dev/loop-control: %m");
2057
2058         nr = ioctl(control, LOOP_CTL_GET_FREE);
2059         if (nr < 0)
2060                 return log_error_errno(errno, "Failed to allocate loop device: %m");
2061
2062         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2063                 return log_oom();
2064
2065         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2066         if (loop < 0)
2067                 return log_error_errno(errno, "Failed to open loop device %s: %m", loopdev);
2068
2069         if (ioctl(loop, LOOP_SET_FD, fd) < 0)
2070                 return log_error_errno(errno, "Failed to set loopback file descriptor on %s: %m", loopdev);
2071
2072         if (arg_read_only)
2073                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2074
2075         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0)
2076                 return log_error_errno(errno, "Failed to set loopback settings on %s: %m", loopdev);
2077
2078         *device_path = loopdev;
2079         loopdev = NULL;
2080
2081         *loop_nr = nr;
2082
2083         r = loop;
2084         loop = -1;
2085
2086         return r;
2087 }
2088
/* Looks for the root, /home and /srv partitions on the GPT-partitioned
 * block device referred to by fd.
 *
 * The partition table is read with libblkid; the device nodes of the
 * partitions are found by enumerating the udev children of the block
 * device. Partitions carrying GPT_FLAG_NO_AUTO are ignored. If more
 * than one partition of a type exists, the one with the lowest
 * partition number is used.
 *
 * root_device/home_device/srv_device: receive newly allocated device
 *         node paths (ownership passes to the caller). root_device is
 *         always set on success, the other two only if a matching
 *         partition was found.
 * *_device_rw: set together with the respective device; false if the
 *         partition carries GPT_FLAG_READ_ONLY.
 * secondary: set to true if the root partition found is of the
 *         GPT_ROOT_SECONDARY type rather than GPT_ROOT_NATIVE.
 *
 * Returns 0 on success, a negative errno-style value on failure:
 * -EINVAL if there is no GPT partition table or no root partition,
 * -ENOTSUP if compiled without blkid support. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, srv_nr = -1;
#ifdef GPT_ROOT_NATIVE
        int root_nr = -1;
#endif
#ifdef GPT_ROOT_SECONDARY
        int secondary_root_nr = -1;
#endif

        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* libblkid does not always set errno on failure; if it is
         * still 0 afterwards the failure is treated as OOM. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error_errno(errno, "Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* ambivalent result, or nothing detected at all */
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to stat block device: %m");

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate all udev devices that have the block device as
         * parent */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Always initialize variables carrying a cleanup
                 * handler, so that the handler never sees garbage,
                 * even if an early exit is added before the
                 * assignment below one day. */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error_errno(errno, "Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* Skip the whole-disk device itself */
                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                /* Map the udev device to its partition table entry */
                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                /* For each type, remember only the partition with the
                 * lowest partition number */
                if (sd_id128_equal(type_id, GPT_HOME)) {

                        if (home && nr >= home_nr)
                                continue;

                        home_nr = nr;
                        home_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(home);
                        home = strdup(node);
                        if (!home)
                                return log_oom();
                } else if (sd_id128_equal(type_id, GPT_SRV)) {

                        if (srv && nr >= srv_nr)
                                continue;

                        srv_nr = nr;
                        srv_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(srv);
                        srv = strdup(node);
                        if (!srv)
                                return log_oom();
                }
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE)) {

                        if (root && nr >= root_nr)
                                continue;

                        root_nr = nr;
                        root_rw = !(flags & GPT_FLAG_READ_ONLY);

                        free(root);
                        root = strdup(node);
                        if (!root)
                                return log_oom();
                }
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY)) {

                        if (secondary_root && nr >= secondary_root_nr)
                                continue;

                        secondary_root_nr = nr;
                        secondary_root_rw = !(flags & GPT_FLAG_READ_ONLY);


                        free(secondary_root);
                        secondary_root = strdup(node);
                        if (!secondary_root)
                                return log_oom();
                }
#endif
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Hand the results over to the caller. The native root
         * partition is preferred over the secondary one. The local
         * pointers are reset so that the cleanup handlers do not free
         * what the caller owns now. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2341
/* Probes the file system type of a block device with libblkid and
 * mounts it.
 *
 * what:      path of the block device to mount
 * where:     base directory of the mount point
 * directory: if non-NULL, appended to where to form the mount point
 * rw:        whether to mount writable; overridden to read-only if
 *            arg_read_only is set
 *
 * The file system is always mounted with MS_NODEV. LUKS volumes are
 * refused with -ENOTSUP.
 *
 * Returns 0 on success, a negative errno-style value on failure:
 * -EINVAL if the file system type cannot be determined, -ENOTSUP if
 * compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype = NULL, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* libblkid does not always set errno on failure; if it is
         * still 0 afterwards the failure is treated as OOM. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error_errno(errno, "Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        /* blkid_do_safeprobe() returns 0 on success, 1 if nothing was
         * detected, -2 if the result is ambivalent and -1 on error.
         * Only the "nothing"/"ambivalent" cases mean that the type
         * cannot be determined (dissect_image() checks for the same
         * two values); -1 is a real probing error and is reported as
         * such below. */
        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error_errno(errno, "Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0)
                return log_error_errno(errno, "Failed to mount %s: %m", what);

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2405
2406 static int mount_devices(
2407                 const char *where,
2408                 const char *root_device, bool root_device_rw,
2409                 const char *home_device, bool home_device_rw,
2410                 const char *srv_device, bool srv_device_rw) {
2411         int r;
2412
2413         assert(where);
2414
2415         if (root_device) {
2416                 r = mount_device(root_device, arg_directory, NULL, root_device_rw);
2417                 if (r < 0)
2418                         return log_error_errno(r, "Failed to mount root directory: %m");
2419         }
2420
2421         if (home_device) {
2422                 r = mount_device(home_device, arg_directory, "/home", home_device_rw);
2423                 if (r < 0)
2424                         return log_error_errno(r, "Failed to mount home directory: %m");
2425         }
2426
2427         if (srv_device) {
2428                 r = mount_device(srv_device, arg_directory, "/srv", srv_device_rw);
2429                 if (r < 0)
2430                         return log_error_errno(r, "Failed to mount server data directory: %m");
2431         }
2432
2433         return 0;
2434 }
2435
2436 static void loop_remove(int nr, int *image_fd) {
2437         _cleanup_close_ int control = -1;
2438         int r;
2439
2440         if (nr < 0)
2441                 return;
2442
2443         if (image_fd && *image_fd >= 0) {
2444                 r = ioctl(*image_fd, LOOP_CLR_FD);
2445                 if (r < 0)
2446                         log_warning_errno(errno, "Failed to close loop image: %m");
2447                 *image_fd = safe_close(*image_fd);
2448         }
2449
2450         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2451         if (control < 0) {
2452                 log_warning_errno(errno, "Failed to open /dev/loop-control: %m");
2453                 return;
2454         }
2455
2456         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2457         if (r < 0)
2458                 log_warning_errno(errno, "Failed to remove loop %d: %m", nr);
2459 }
2460
2461 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2462         int pipe_fds[2];
2463         pid_t pid;
2464
2465         assert(database);
2466         assert(key);
2467         assert(rpid);
2468
2469         if (pipe2(pipe_fds, O_CLOEXEC) < 0)
2470                 return log_error_errno(errno, "Failed to allocate pipe: %m");
2471
2472         pid = fork();
2473         if (pid < 0)
2474                 return log_error_errno(errno, "Failed to fork getent child: %m");
2475         else if (pid == 0) {
2476                 int nullfd;
2477                 char *empty_env = NULL;
2478
2479                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2480                         _exit(EXIT_FAILURE);
2481
2482                 if (pipe_fds[0] > 2)
2483                         safe_close(pipe_fds[0]);
2484                 if (pipe_fds[1] > 2)
2485                         safe_close(pipe_fds[1]);
2486
2487                 nullfd = open("/dev/null", O_RDWR);
2488                 if (nullfd < 0)
2489                         _exit(EXIT_FAILURE);
2490
2491                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2492                         _exit(EXIT_FAILURE);
2493
2494                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2495                         _exit(EXIT_FAILURE);
2496
2497                 if (nullfd > 2)
2498                         safe_close(nullfd);
2499
2500                 reset_all_signal_handlers();
2501                 close_all_fds(NULL, 0);
2502
2503                 execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
2504                 execle("/bin/getent", "getent", database, key, NULL, &empty_env);
2505                 _exit(EXIT_FAILURE);
2506         }
2507
2508         pipe_fds[1] = safe_close(pipe_fds[1]);
2509
2510         *rpid = pid;
2511
2512         return pipe_fds[0];
2513 }
2514
2515 static int change_uid_gid(char **_home) {
2516         char line[LINE_MAX], *x, *u, *g, *h;
2517         const char *word, *state;
2518         _cleanup_free_ uid_t *uids = NULL;
2519         _cleanup_free_ char *home = NULL;
2520         _cleanup_fclose_ FILE *f = NULL;
2521         _cleanup_close_ int fd = -1;
2522         unsigned n_uids = 0;
2523         size_t sz = 0, l;
2524         uid_t uid;
2525         gid_t gid;
2526         pid_t pid;
2527         int r;
2528
2529         assert(_home);
2530
2531         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2532                 /* Reset everything fully to 0, just in case */
2533
2534                 if (setgroups(0, NULL) < 0)
2535                         return log_error_errno(errno, "setgroups() failed: %m");
2536
2537                 if (setresgid(0, 0, 0) < 0)
2538                         return log_error_errno(errno, "setregid() failed: %m");
2539
2540                 if (setresuid(0, 0, 0) < 0)
2541                         return log_error_errno(errno, "setreuid() failed: %m");
2542
2543                 *_home = NULL;
2544                 return 0;
2545         }
2546
2547         /* First, get user credentials */
2548         fd = spawn_getent("passwd", arg_user, &pid);
2549         if (fd < 0)
2550                 return fd;
2551
2552         f = fdopen(fd, "r");
2553         if (!f)
2554                 return log_oom();
2555         fd = -1;
2556
2557         if (!fgets(line, sizeof(line), f)) {
2558
2559                 if (!ferror(f)) {
2560                         log_error("Failed to resolve user %s.", arg_user);
2561                         return -ESRCH;
2562                 }
2563
2564                 log_error_errno(errno, "Failed to read from getent: %m");
2565                 return -errno;
2566         }
2567
2568         truncate_nl(line);
2569
2570         wait_for_terminate_and_warn("getent passwd", pid, true);
2571
2572         x = strchr(line, ':');
2573         if (!x) {
2574                 log_error("/etc/passwd entry has invalid user field.");
2575                 return -EIO;
2576         }
2577
2578         u = strchr(x+1, ':');
2579         if (!u) {
2580                 log_error("/etc/passwd entry has invalid password field.");
2581                 return -EIO;
2582         }
2583
2584         u++;
2585         g = strchr(u, ':');
2586         if (!g) {
2587                 log_error("/etc/passwd entry has invalid UID field.");
2588                 return -EIO;
2589         }
2590
2591         *g = 0;
2592         g++;
2593         x = strchr(g, ':');
2594         if (!x) {
2595                 log_error("/etc/passwd entry has invalid GID field.");
2596                 return -EIO;
2597         }
2598
2599         *x = 0;
2600         h = strchr(x+1, ':');
2601         if (!h) {
2602                 log_error("/etc/passwd entry has invalid GECOS field.");
2603                 return -EIO;
2604         }
2605
2606         h++;
2607         x = strchr(h, ':');
2608         if (!x) {
2609                 log_error("/etc/passwd entry has invalid home directory field.");
2610                 return -EIO;
2611         }
2612
2613         *x = 0;
2614
2615         r = parse_uid(u, &uid);
2616         if (r < 0) {
2617                 log_error("Failed to parse UID of user.");
2618                 return -EIO;
2619         }
2620
2621         r = parse_gid(g, &gid);
2622         if (r < 0) {
2623                 log_error("Failed to parse GID of user.");
2624                 return -EIO;
2625         }
2626
2627         home = strdup(h);
2628         if (!home)
2629                 return log_oom();
2630
2631         /* Second, get group memberships */
2632         fd = spawn_getent("initgroups", arg_user, &pid);
2633         if (fd < 0)
2634                 return fd;
2635
2636         fclose(f);
2637         f = fdopen(fd, "r");
2638         if (!f)
2639                 return log_oom();
2640         fd = -1;
2641
2642         if (!fgets(line, sizeof(line), f)) {
2643                 if (!ferror(f)) {
2644                         log_error("Failed to resolve user %s.", arg_user);
2645                         return -ESRCH;
2646                 }
2647
2648                 log_error_errno(errno, "Failed to read from getent: %m");
2649                 return -errno;
2650         }
2651
2652         truncate_nl(line);
2653
2654         wait_for_terminate_and_warn("getent initgroups", pid, true);
2655
2656         /* Skip over the username and subsequent separator whitespace */
2657         x = line;
2658         x += strcspn(x, WHITESPACE);
2659         x += strspn(x, WHITESPACE);
2660
2661         FOREACH_WORD(word, l, x, state) {
2662                 char c[l+1];
2663
2664                 memcpy(c, word, l);
2665                 c[l] = 0;
2666
2667                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2668                         return log_oom();
2669
2670                 r = parse_uid(c, &uids[n_uids++]);
2671                 if (r < 0) {
2672                         log_error("Failed to parse group data from getent.");
2673                         return -EIO;
2674                 }
2675         }
2676
2677         r = mkdir_parents(home, 0775);
2678         if (r < 0)
2679                 return log_error_errno(r, "Failed to make home root directory: %m");
2680
2681         r = mkdir_safe(home, 0755, uid, gid);
2682         if (r < 0 && r != -EEXIST)
2683                 return log_error_errno(r, "Failed to make home directory: %m");
2684
2685         fchown(STDIN_FILENO, uid, gid);
2686         fchown(STDOUT_FILENO, uid, gid);
2687         fchown(STDERR_FILENO, uid, gid);
2688
2689         if (setgroups(n_uids, uids) < 0)
2690                 return log_error_errno(errno, "Failed to set auxiliary groups: %m");
2691
2692         if (setresgid(gid, gid, gid) < 0)
2693                 return log_error_errno(errno, "setregid() failed: %m");
2694
2695         if (setresuid(uid, uid, uid) < 0)
2696                 return log_error_errno(errno, "setreuid() failed: %m");
2697
2698         if (_home) {
2699                 *_home = home;
2700                 home = NULL;
2701         }
2702
2703         return 0;
2704 }
2705
2706 /*
2707  * Return values:
2708  * < 0 : wait_for_terminate() failed to get the state of the
2709  *       container, the container was terminated by a signal, or
2710  *       failed for an unknown reason.  No change is made to the
2711  *       container argument.
2712  * > 0 : The program executed in the container terminated with an
2713  *       error.  The exit code of the program executed in the
2714  *       container is returned.  The container argument has been set
2715  *       to CONTAINER_TERMINATED.
2716  *   0 : The container is being rebooted, has been shut down or exited
2717  *       successfully.  The container argument has been set to either
2718  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2719  *
2720  * That is, success is indicated by a return value of zero, and an
2721  * error is indicated by a non-zero value.
2722  */
2723 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2724         siginfo_t status;
2725         int r;
2726
2727         r = wait_for_terminate(pid, &status);
2728         if (r < 0)
2729                 return log_warning_errno(r, "Failed to wait for container: %m");
2730
2731         switch (status.si_code) {
2732
2733         case CLD_EXITED:
2734                 if (status.si_status == 0) {
2735                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2736
2737                 } else
2738                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2739
2740                 *container = CONTAINER_TERMINATED;
2741                 return status.si_status;
2742
2743         case CLD_KILLED:
2744                 if (status.si_status == SIGINT) {
2745
2746                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2747                         *container = CONTAINER_TERMINATED;
2748                         return 0;
2749
2750                 } else if (status.si_status == SIGHUP) {
2751
2752                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2753                         *container = CONTAINER_REBOOTED;
2754                         return 0;
2755                 }
2756
2757                 /* CLD_KILLED fallthrough */
2758
2759         case CLD_DUMPED:
2760                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2761                 return -EIO;
2762
2763         default:
2764                 log_error("Container %s failed due to unknown reason.", arg_machine);
2765                 return -EIO;
2766         }
2767
2768         return r;
2769 }
2770
/* Signal handler that intentionally does nothing. main() installs it for
 * SIGCHLD so that the signal interrupts blocking calls in the parent. */
static void nop_handler(int sig) {
        (void) sig;
}
2772
2773 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2774         pid_t pid;
2775
2776         pid = PTR_TO_UINT32(userdata);
2777         if (pid > 0) {
2778                 if (kill(pid, SIGRTMIN+3) >= 0) {
2779                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2780                         sd_event_source_set_userdata(s, NULL);
2781                         return 0;
2782                 }
2783         }
2784
2785         sd_event_exit(sd_event_source_get_event(s), 0);
2786         return 0;
2787 }
2788
2789 int main(int argc, char *argv[]) {
2790
2791         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2792         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2793         _cleanup_close_ int master = -1, image_fd = -1;
2794         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2795         _cleanup_fdset_free_ FDSet *fds = NULL;
2796         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2797         const char *console = NULL;
2798         char veth_name[IFNAMSIZ];
2799         bool secondary = false;
2800         sigset_t mask, mask_chld;
2801         pid_t pid = 0;
2802
2803         log_parse_environment();
2804         log_open();
2805
2806         k = parse_argv(argc, argv);
2807         if (k < 0)
2808                 goto finish;
2809         else if (k == 0) {
2810                 r = EXIT_SUCCESS;
2811                 goto finish;
2812         }
2813
2814         if (!arg_image) {
2815                 if (arg_directory) {
2816                         char *p;
2817
2818                         p = path_make_absolute_cwd(arg_directory);
2819                         free(arg_directory);
2820                         arg_directory = p;
2821                 } else
2822                         arg_directory = get_current_dir_name();
2823
2824                 if (!arg_directory) {
2825                         log_error("Failed to determine path, please use -D.");
2826                         goto finish;
2827                 }
2828                 path_kill_slashes(arg_directory);
2829         }
2830
2831         if (!arg_machine) {
2832                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2833                 if (!arg_machine) {
2834                         log_oom();
2835                         goto finish;
2836                 }
2837
2838                 hostname_cleanup(arg_machine, false);
2839                 if (isempty(arg_machine)) {
2840                         log_error("Failed to determine machine name automatically, please use -M.");
2841                         goto finish;
2842                 }
2843         }
2844
2845         if (geteuid() != 0) {
2846                 log_error("Need to be root.");
2847                 goto finish;
2848         }
2849
2850         if (sd_booted() <= 0) {
2851                 log_error("Not running on a systemd system.");
2852                 goto finish;
2853         }
2854
2855         log_close();
2856         n_fd_passed = sd_listen_fds(false);
2857         if (n_fd_passed > 0) {
2858                 k = fdset_new_listen_fds(&fds, false);
2859                 if (k < 0) {
2860                         log_error_errno(k, "Failed to collect file descriptors: %m");
2861                         goto finish;
2862                 }
2863         }
2864         fdset_close_others(fds);
2865         log_open();
2866
2867         if (arg_directory) {
2868                 if (path_equal(arg_directory, "/")) {
2869                         log_error("Spawning container on root directory not supported.");
2870                         goto finish;
2871                 }
2872
2873                 if (arg_boot) {
2874                         if (path_is_os_tree(arg_directory) <= 0) {
2875                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2876                                 goto finish;
2877                         }
2878                 } else {
2879                         const char *p;
2880
2881                         p = strappenda(arg_directory,
2882                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2883                         if (access(p, F_OK) < 0) {
2884                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2885                                 goto finish;
2886
2887                         }
2888                 }
2889         } else {
2890                 char template[] = "/tmp/nspawn-root-XXXXXX";
2891
2892                 if (!mkdtemp(template)) {
2893                         log_error_errno(errno, "Failed to create temporary directory: %m");
2894                         r = -errno;
2895                         goto finish;
2896                 }
2897
2898                 arg_directory = strdup(template);
2899                 if (!arg_directory) {
2900                         r = log_oom();
2901                         goto finish;
2902                 }
2903
2904                 image_fd = setup_image(&device_path, &loop_nr);
2905                 if (image_fd < 0) {
2906                         r = image_fd;
2907                         goto finish;
2908                 }
2909
2910                 r = dissect_image(image_fd,
2911                                   &root_device, &root_device_rw,
2912                                   &home_device, &home_device_rw,
2913                                   &srv_device, &srv_device_rw,
2914                                   &secondary);
2915                 if (r < 0)
2916                         goto finish;
2917         }
2918
2919         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2920         if (master < 0) {
2921                 log_error_errno(errno, "Failed to acquire pseudo tty: %m");
2922                 goto finish;
2923         }
2924
2925         console = ptsname(master);
2926         if (!console) {
2927                 log_error_errno(errno, "Failed to determine tty name: %m");
2928                 goto finish;
2929         }
2930
2931         if (!arg_quiet)
2932                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2933                          arg_machine, arg_image ? arg_image : arg_directory);
2934
2935         if (unlockpt(master) < 0) {
2936                 log_error_errno(errno, "Failed to unlock tty: %m");
2937                 goto finish;
2938         }
2939
2940         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2941                 log_error_errno(errno, "Failed to create kmsg socket pair: %m");
2942                 goto finish;
2943         }
2944
2945         sd_notify(false,
2946                   "READY=1\n"
2947                   "STATUS=Container running.");
2948
2949         assert_se(sigemptyset(&mask) == 0);
2950         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2951         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2952
2953         assert_se(sigemptyset(&mask_chld) == 0);
2954         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
2955
2956         for (;;) {
2957                 ContainerStatus container_status;
2958                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
2959                 struct sigaction sa = {
2960                         .sa_handler = nop_handler,
2961                         .sa_flags = SA_NOCLDSTOP,
2962                 };
2963
2964                 r = barrier_create(&barrier);
2965                 if (r < 0) {
2966                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
2967                         goto finish;
2968                 }
2969
2970                 /* Child can be killed before execv(), so handle SIGCHLD
2971                  * in order to interrupt parent's blocking calls and
2972                  * give it a chance to call wait() and terminate. */
2973                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
2974                 if (r < 0) {
2975                         log_error_errno(errno, "Failed to change the signal mask: %m");
2976                         goto finish;
2977                 }
2978
2979                 r = sigaction(SIGCHLD, &sa, NULL);
2980                 if (r < 0) {
2981                         log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
2982                         goto finish;
2983                 }
2984
2985                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
2986                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2987                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
2988                 if (pid < 0) {
2989                         if (errno == EINVAL)
2990                                 log_error_errno(errno, "clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2991                         else
2992                                 log_error_errno(errno, "clone() failed: %m");
2993
2994                         r = pid;
2995                         goto finish;
2996                 }
2997
2998                 if (pid == 0) {
2999                         /* child */
3000                         _cleanup_free_ char *home = NULL;
3001                         unsigned n_env = 2;
3002                         const char *envp[] = {
3003                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3004                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3005                                 NULL, /* TERM */
3006                                 NULL, /* HOME */
3007                                 NULL, /* USER */
3008                                 NULL, /* LOGNAME */
3009                                 NULL, /* container_uuid */
3010                                 NULL, /* LISTEN_FDS */
3011                                 NULL, /* LISTEN_PID */
3012                                 NULL
3013                         };
3014                         char **env_use;
3015
3016                         barrier_set_role(&barrier, BARRIER_CHILD);
3017
3018                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3019                         if (envp[n_env])
3020                                 n_env ++;
3021
3022                         master = safe_close(master);
3023
3024                         close_nointr(STDIN_FILENO);
3025                         close_nointr(STDOUT_FILENO);
3026                         close_nointr(STDERR_FILENO);
3027
3028                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3029
3030                         reset_all_signal_handlers();
3031                         reset_signal_mask();
3032
3033                         k = open_terminal(console, O_RDWR);
3034                         if (k != STDIN_FILENO) {
3035                                 if (k >= 0) {
3036                                         safe_close(k);
3037                                         k = -EINVAL;
3038                                 }
3039
3040                                 log_error_errno(k, "Failed to open console: %m");
3041                                 _exit(EXIT_FAILURE);
3042                         }
3043
3044                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3045                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3046                                 log_error_errno(errno, "Failed to duplicate console: %m");
3047                                 _exit(EXIT_FAILURE);
3048                         }
3049
3050                         if (setsid() < 0) {
3051                                 log_error_errno(errno, "setsid() failed: %m");
3052                                 _exit(EXIT_FAILURE);
3053                         }
3054
3055                         if (reset_audit_loginuid() < 0)
3056                                 _exit(EXIT_FAILURE);
3057
3058                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3059                                 log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3060                                 _exit(EXIT_FAILURE);
3061                         }
3062
3063                         /* Mark everything as slave, so that we still
3064                          * receive mounts from the real root, but don't
3065                          * propagate mounts to the real root. */
3066                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
3067                                 log_error_errno(errno, "MS_SLAVE|MS_REC failed: %m");
3068                                 _exit(EXIT_FAILURE);
3069                         }
3070
3071                         if (mount_devices(arg_directory,
3072                                           root_device, root_device_rw,
3073                                           home_device, home_device_rw,
3074                                           srv_device, srv_device_rw) < 0)
3075                                 _exit(EXIT_FAILURE);
3076
3077                         /* Turn directory into bind mount */
3078                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
3079                                 log_error_errno(errno, "Failed to make bind mount: %m");
3080                                 _exit(EXIT_FAILURE);
3081                         }
3082
3083                         r = setup_volatile(arg_directory);
3084                         if (r < 0)
3085                                 _exit(EXIT_FAILURE);
3086
3087                         if (setup_volatile_state(arg_directory) < 0)
3088                                 _exit(EXIT_FAILURE);
3089
3090                         r = base_filesystem_create(arg_directory);
3091                         if (r < 0)
3092                                 _exit(EXIT_FAILURE);
3093
3094                         if (arg_read_only) {
3095                                 k = bind_remount_recursive(arg_directory, true);
3096                                 if (k < 0) {
3097                                         log_error_errno(k, "Failed to make tree read-only: %m");
3098                                         _exit(EXIT_FAILURE);
3099                                 }
3100                         }
3101
3102                         if (mount_all(arg_directory) < 0)
3103                                 _exit(EXIT_FAILURE);
3104
3105                         if (copy_devnodes(arg_directory) < 0)
3106                                 _exit(EXIT_FAILURE);
3107