chiark / gitweb /
treewide: yet more log_*_errno + return simplifications
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <net/if.h>
44 #include <linux/veth.h>
45 #include <sys/personality.h>
46 #include <linux/loop.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #ifdef HAVE_BLKID
57 #include <blkid/blkid.h>
58 #endif
59
60 #include "sd-daemon.h"
61 #include "sd-bus.h"
62 #include "sd-id128.h"
63 #include "sd-rtnl.h"
64 #include "log.h"
65 #include "util.h"
66 #include "mkdir.h"
67 #include "macro.h"
68 #include "audit.h"
69 #include "missing.h"
70 #include "cgroup-util.h"
71 #include "strv.h"
72 #include "path-util.h"
73 #include "loopback-setup.h"
74 #include "dev-setup.h"
75 #include "fdset.h"
76 #include "build.h"
77 #include "fileio.h"
78 #include "bus-util.h"
79 #include "bus-error.h"
80 #include "ptyfwd.h"
81 #include "bus-kernel.h"
82 #include "env-util.h"
83 #include "def.h"
84 #include "rtnl-util.h"
85 #include "udev-util.h"
86 #include "blkid-util.h"
87 #include "gpt.h"
88 #include "siphash24.h"
89 #include "copy.h"
90 #include "base-filesystem.h"
91 #include "barrier.h"
92 #include "event-util.h"
93
94 #ifdef HAVE_SECCOMP
95 #include "seccomp-util.h"
96 #endif
97
/* How a container run ended. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1
} ContainerStatus;

/* Journal linking mode selected with --link-journal= / -j (see parse_argv()). */
typedef enum LinkJournal {
        LINK_NO    = 0,         /* "no" */
        LINK_AUTO  = 1,         /* "auto" (the default) */
        LINK_HOST  = 2,         /* "host" / "try-host" */
        LINK_GUEST = 3          /* "guest" / "try-guest" / -j */
} LinkJournal;

/* Volatile mode selected with --volatile[=MODE] (see parse_argv()). */
typedef enum Volatile {
        VOLATILE_NO    = 0,     /* option absent, or a false boolean */
        VOLATILE_YES   = 1,     /* option without argument, or a true boolean */
        VOLATILE_STATE = 2,     /* "state" */
} Volatile;
115
116 static char *arg_directory = NULL;
117 static char *arg_user = NULL;
118 static sd_id128_t arg_uuid = {};
119 static char *arg_machine = NULL;
120 static const char *arg_selinux_context = NULL;
121 static const char *arg_selinux_apifs_context = NULL;
122 static const char *arg_slice = NULL;
123 static bool arg_private_network = false;
124 static bool arg_read_only = false;
125 static bool arg_boot = false;
126 static LinkJournal arg_link_journal = LINK_AUTO;
127 static bool arg_link_journal_try = false;
128 static uint64_t arg_retain =
129         (1ULL << CAP_CHOWN) |
130         (1ULL << CAP_DAC_OVERRIDE) |
131         (1ULL << CAP_DAC_READ_SEARCH) |
132         (1ULL << CAP_FOWNER) |
133         (1ULL << CAP_FSETID) |
134         (1ULL << CAP_IPC_OWNER) |
135         (1ULL << CAP_KILL) |
136         (1ULL << CAP_LEASE) |
137         (1ULL << CAP_LINUX_IMMUTABLE) |
138         (1ULL << CAP_NET_BIND_SERVICE) |
139         (1ULL << CAP_NET_BROADCAST) |
140         (1ULL << CAP_NET_RAW) |
141         (1ULL << CAP_SETGID) |
142         (1ULL << CAP_SETFCAP) |
143         (1ULL << CAP_SETPCAP) |
144         (1ULL << CAP_SETUID) |
145         (1ULL << CAP_SYS_ADMIN) |
146         (1ULL << CAP_SYS_CHROOT) |
147         (1ULL << CAP_SYS_NICE) |
148         (1ULL << CAP_SYS_PTRACE) |
149         (1ULL << CAP_SYS_TTY_CONFIG) |
150         (1ULL << CAP_SYS_RESOURCE) |
151         (1ULL << CAP_SYS_BOOT) |
152         (1ULL << CAP_AUDIT_WRITE) |
153         (1ULL << CAP_AUDIT_CONTROL) |
154         (1ULL << CAP_MKNOD);
155 static char **arg_bind = NULL;
156 static char **arg_bind_ro = NULL;
157 static char **arg_tmpfs = NULL;
158 static char **arg_setenv = NULL;
159 static bool arg_quiet = false;
160 static bool arg_share_system = false;
161 static bool arg_register = true;
162 static bool arg_keep_unit = false;
163 static char **arg_network_interfaces = NULL;
164 static char **arg_network_macvlan = NULL;
165 static bool arg_network_veth = false;
166 static const char *arg_network_bridge = NULL;
167 static unsigned long arg_personality = 0xffffffffLU;
168 static const char *arg_image = NULL;
169 static Volatile arg_volatile = VOLATILE_NO;
170
171 static void help(void) {
172         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
173                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
174                "  -h --help                 Show this help\n"
175                "     --version              Print version string\n"
176                "  -q --quiet                Do not show status information\n"
177                "  -D --directory=PATH       Root directory for the container\n"
178                "  -i --image=PATH           File system device or image for the container\n"
179                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
180                "  -u --user=USER            Run the command under specified user or uid\n"
181                "  -M --machine=NAME         Set the machine name for the container\n"
182                "     --uuid=UUID            Set a specific machine UUID for the container\n"
183                "  -S --slice=SLICE          Place the container in the specified slice\n"
184                "     --private-network      Disable network in container\n"
185                "     --network-interface=INTERFACE\n"
186                "                            Assign an existing network interface to the\n"
187                "                            container\n"
188                "     --network-macvlan=INTERFACE\n"
189                "                            Create a macvlan network interface based on an\n"
190                "                            existing network interface to the container\n"
191                "     --network-veth         Add a virtual ethernet connection between host\n"
192                "                            and container\n"
193                "     --network-bridge=INTERFACE\n"
194                "                            Add a virtual ethernet connection between host\n"
195                "                            and container and add it to an existing bridge on\n"
196                "                            the host\n"
197                "  -Z --selinux-context=SECLABEL\n"
198                "                            Set the SELinux security context to be used by\n"
199                "                            processes in the container\n"
200                "  -L --selinux-apifs-context=SECLABEL\n"
201                "                            Set the SELinux security context to be used by\n"
202                "                            API/tmpfs file systems in the container\n"
203                "     --capability=CAP       In addition to the default, retain specified\n"
204                "                            capability\n"
205                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
206                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host,\n"
207                "                            try-guest, try-host\n"
208                "  -j                        Equivalent to --link-journal=try-guest\n"
209                "     --read-only            Mount the root directory read-only\n"
210                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
211                "                            the container\n"
212                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
213                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
214                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
215                "     --share-system         Share system namespaces with host\n"
216                "     --register=BOOLEAN     Register container as machine\n"
217                "     --keep-unit            Do not register a scope for the machine, reuse\n"
218                "                            the service unit nspawn is running in\n"
219                "     --volatile[=MODE]      Run the system in volatile mode\n",
220                program_invocation_short_name);
221 }
222
223 static int parse_argv(int argc, char *argv[]) {
224
225         enum {
226                 ARG_VERSION = 0x100,
227                 ARG_PRIVATE_NETWORK,
228                 ARG_UUID,
229                 ARG_READ_ONLY,
230                 ARG_CAPABILITY,
231                 ARG_DROP_CAPABILITY,
232                 ARG_LINK_JOURNAL,
233                 ARG_BIND,
234                 ARG_BIND_RO,
235                 ARG_TMPFS,
236                 ARG_SETENV,
237                 ARG_SHARE_SYSTEM,
238                 ARG_REGISTER,
239                 ARG_KEEP_UNIT,
240                 ARG_NETWORK_INTERFACE,
241                 ARG_NETWORK_MACVLAN,
242                 ARG_NETWORK_VETH,
243                 ARG_NETWORK_BRIDGE,
244                 ARG_PERSONALITY,
245                 ARG_VOLATILE,
246         };
247
248         static const struct option options[] = {
249                 { "help",                  no_argument,       NULL, 'h'                   },
250                 { "version",               no_argument,       NULL, ARG_VERSION           },
251                 { "directory",             required_argument, NULL, 'D'                   },
252                 { "user",                  required_argument, NULL, 'u'                   },
253                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
254                 { "boot",                  no_argument,       NULL, 'b'                   },
255                 { "uuid",                  required_argument, NULL, ARG_UUID              },
256                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
257                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
258                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
259                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
260                 { "bind",                  required_argument, NULL, ARG_BIND              },
261                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
262                 { "tmpfs",                 required_argument, NULL, ARG_TMPFS             },
263                 { "machine",               required_argument, NULL, 'M'                   },
264                 { "slice",                 required_argument, NULL, 'S'                   },
265                 { "setenv",                required_argument, NULL, ARG_SETENV            },
266                 { "selinux-context",       required_argument, NULL, 'Z'                   },
267                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
268                 { "quiet",                 no_argument,       NULL, 'q'                   },
269                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
270                 { "register",              required_argument, NULL, ARG_REGISTER          },
271                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
272                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
273                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
274                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
275                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
276                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
277                 { "image",                 required_argument, NULL, 'i'                   },
278                 { "volatile",              optional_argument, NULL, ARG_VOLATILE          },
279                 {}
280         };
281
282         int c, r;
283         uint64_t plus = 0, minus = 0;
284
285         assert(argc >= 0);
286         assert(argv);
287
288         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0)
289
290                 switch (c) {
291
292                 case 'h':
293                         help();
294                         return 0;
295
296                 case ARG_VERSION:
297                         puts(PACKAGE_STRING);
298                         puts(SYSTEMD_FEATURES);
299                         return 0;
300
301                 case 'D':
302                         free(arg_directory);
303                         arg_directory = canonicalize_file_name(optarg);
304                         if (!arg_directory) {
305                                 log_error("Invalid root directory: %m");
306                                 return -ENOMEM;
307                         }
308
309                         break;
310
311                 case 'i':
312                         arg_image = optarg;
313                         break;
314
315                 case 'u':
316                         free(arg_user);
317                         arg_user = strdup(optarg);
318                         if (!arg_user)
319                                 return log_oom();
320
321                         break;
322
323                 case ARG_NETWORK_BRIDGE:
324                         arg_network_bridge = optarg;
325
326                         /* fall through */
327
328                 case ARG_NETWORK_VETH:
329                         arg_network_veth = true;
330                         arg_private_network = true;
331                         break;
332
333                 case ARG_NETWORK_INTERFACE:
334                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
335                                 return log_oom();
336
337                         arg_private_network = true;
338                         break;
339
340                 case ARG_NETWORK_MACVLAN:
341                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
342                                 return log_oom();
343
344                         /* fall through */
345
346                 case ARG_PRIVATE_NETWORK:
347                         arg_private_network = true;
348                         break;
349
350                 case 'b':
351                         arg_boot = true;
352                         break;
353
354                 case ARG_UUID:
355                         r = sd_id128_from_string(optarg, &arg_uuid);
356                         if (r < 0) {
357                                 log_error("Invalid UUID: %s", optarg);
358                                 return r;
359                         }
360                         break;
361
362                 case 'S':
363                         arg_slice = optarg;
364                         break;
365
366                 case 'M':
367                         if (isempty(optarg)) {
368                                 free(arg_machine);
369                                 arg_machine = NULL;
370                         } else {
371
372                                 if (!hostname_is_valid(optarg)) {
373                                         log_error("Invalid machine name: %s", optarg);
374                                         return -EINVAL;
375                                 }
376
377                                 free(arg_machine);
378                                 arg_machine = strdup(optarg);
379                                 if (!arg_machine)
380                                         return log_oom();
381
382                                 break;
383                         }
384
385                 case 'Z':
386                         arg_selinux_context = optarg;
387                         break;
388
389                 case 'L':
390                         arg_selinux_apifs_context = optarg;
391                         break;
392
393                 case ARG_READ_ONLY:
394                         arg_read_only = true;
395                         break;
396
397                 case ARG_CAPABILITY:
398                 case ARG_DROP_CAPABILITY: {
399                         const char *state, *word;
400                         size_t length;
401
402                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
403                                 _cleanup_free_ char *t;
404                                 cap_value_t cap;
405
406                                 t = strndup(word, length);
407                                 if (!t)
408                                         return log_oom();
409
410                                 if (streq(t, "all")) {
411                                         if (c == ARG_CAPABILITY)
412                                                 plus = (uint64_t) -1;
413                                         else
414                                                 minus = (uint64_t) -1;
415                                 } else {
416                                         if (cap_from_name(t, &cap) < 0) {
417                                                 log_error("Failed to parse capability %s.", t);
418                                                 return -EINVAL;
419                                         }
420
421                                         if (c == ARG_CAPABILITY)
422                                                 plus |= 1ULL << (uint64_t) cap;
423                                         else
424                                                 minus |= 1ULL << (uint64_t) cap;
425                                 }
426                         }
427
428                         break;
429                 }
430
431                 case 'j':
432                         arg_link_journal = LINK_GUEST;
433                         arg_link_journal_try = true;
434                         break;
435
436                 case ARG_LINK_JOURNAL:
437                         if (streq(optarg, "auto"))
438                                 arg_link_journal = LINK_AUTO;
439                         else if (streq(optarg, "no"))
440                                 arg_link_journal = LINK_NO;
441                         else if (streq(optarg, "guest"))
442                                 arg_link_journal = LINK_GUEST;
443                         else if (streq(optarg, "host"))
444                                 arg_link_journal = LINK_HOST;
445                         else if (streq(optarg, "try-guest")) {
446                                 arg_link_journal = LINK_GUEST;
447                                 arg_link_journal_try = true;
448                         } else if (streq(optarg, "try-host")) {
449                                 arg_link_journal = LINK_HOST;
450                                 arg_link_journal_try = true;
451                         } else {
452                                 log_error("Failed to parse link journal mode %s", optarg);
453                                 return -EINVAL;
454                         }
455
456                         break;
457
458                 case ARG_BIND:
459                 case ARG_BIND_RO: {
460                         _cleanup_free_ char *a = NULL, *b = NULL;
461                         char *e;
462                         char ***x;
463
464                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
465
466                         e = strchr(optarg, ':');
467                         if (e) {
468                                 a = strndup(optarg, e - optarg);
469                                 b = strdup(e + 1);
470                         } else {
471                                 a = strdup(optarg);
472                                 b = strdup(optarg);
473                         }
474
475                         if (!a || !b)
476                                 return log_oom();
477
478                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
479                                 log_error("Invalid bind mount specification: %s", optarg);
480                                 return -EINVAL;
481                         }
482
483                         r = strv_extend(x, a);
484                         if (r < 0)
485                                 return log_oom();
486
487                         r = strv_extend(x, b);
488                         if (r < 0)
489                                 return log_oom();
490
491                         break;
492                 }
493
494                 case ARG_TMPFS: {
495                         _cleanup_free_ char *a = NULL, *b = NULL;
496                         char *e;
497
498                         e = strchr(optarg, ':');
499                         if (e) {
500                                 a = strndup(optarg, e - optarg);
501                                 b = strdup(e + 1);
502                         } else {
503                                 a = strdup(optarg);
504                                 b = strdup("mode=0755");
505                         }
506
507                         if (!a || !b)
508                                 return log_oom();
509
510                         if (!path_is_absolute(a)) {
511                                 log_error("Invalid tmpfs specification: %s", optarg);
512                                 return -EINVAL;
513                         }
514
515                         r = strv_push(&arg_tmpfs, a);
516                         if (r < 0)
517                                 return log_oom();
518
519                         a = NULL;
520
521                         r = strv_push(&arg_tmpfs, b);
522                         if (r < 0)
523                                 return log_oom();
524
525                         b = NULL;
526
527                         break;
528                 }
529
530                 case ARG_SETENV: {
531                         char **n;
532
533                         if (!env_assignment_is_valid(optarg)) {
534                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
535                                 return -EINVAL;
536                         }
537
538                         n = strv_env_set(arg_setenv, optarg);
539                         if (!n)
540                                 return log_oom();
541
542                         strv_free(arg_setenv);
543                         arg_setenv = n;
544                         break;
545                 }
546
547                 case 'q':
548                         arg_quiet = true;
549                         break;
550
551                 case ARG_SHARE_SYSTEM:
552                         arg_share_system = true;
553                         break;
554
555                 case ARG_REGISTER:
556                         r = parse_boolean(optarg);
557                         if (r < 0) {
558                                 log_error("Failed to parse --register= argument: %s", optarg);
559                                 return r;
560                         }
561
562                         arg_register = r;
563                         break;
564
565                 case ARG_KEEP_UNIT:
566                         arg_keep_unit = true;
567                         break;
568
569                 case ARG_PERSONALITY:
570
571                         arg_personality = personality_from_string(optarg);
572                         if (arg_personality == 0xffffffffLU) {
573                                 log_error("Unknown or unsupported personality '%s'.", optarg);
574                                 return -EINVAL;
575                         }
576
577                         break;
578
579                 case ARG_VOLATILE:
580
581                         if (!optarg)
582                                 arg_volatile = VOLATILE_YES;
583                         else {
584                                 r = parse_boolean(optarg);
585                                 if (r < 0) {
586                                         if (streq(optarg, "state"))
587                                                 arg_volatile = VOLATILE_STATE;
588                                         else {
589                                                 log_error("Failed to parse --volatile= argument: %s", optarg);
590                                                 return r;
591                                         }
592                                 } else
593                                         arg_volatile = r ? VOLATILE_YES : VOLATILE_NO;
594                         }
595
596                         break;
597
598                 case '?':
599                         return -EINVAL;
600
601                 default:
602                         assert_not_reached("Unhandled option");
603                 }
604
605         if (arg_share_system)
606                 arg_register = false;
607
608         if (arg_boot && arg_share_system) {
609                 log_error("--boot and --share-system may not be combined.");
610                 return -EINVAL;
611         }
612
613         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
614                 log_error("--keep-unit may not be used when invoked from a user session.");
615                 return -EINVAL;
616         }
617
618         if (arg_directory && arg_image) {
619                 log_error("--directory= and --image= may not be combined.");
620                 return -EINVAL;
621         }
622
623         if (arg_volatile != VOLATILE_NO && arg_read_only) {
624                 log_error("Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
625                 return -EINVAL;
626         }
627
628         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
629
630         return 1;
631 }
632
633 static int mount_all(const char *dest) {
634
635         typedef struct MountPoint {
636                 const char *what;
637                 const char *where;
638                 const char *type;
639                 const char *options;
640                 unsigned long flags;
641                 bool fatal;
642         } MountPoint;
643
644         static const MountPoint mount_table[] = {
645                 { "proc",      "/proc",     "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,           true  },
646                 { "/proc/sys", "/proc/sys", NULL,    NULL,        MS_BIND,                                true  },   /* Bind mount first */
647                 { NULL,        "/proc/sys", NULL,    NULL,        MS_BIND|MS_RDONLY|MS_REMOUNT,           true  },   /* Then, make it r/o */
648                 { "sysfs",     "/sys",      "sysfs", NULL,        MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
649                 { "tmpfs",     "/dev",      "tmpfs", "mode=755",  MS_NOSUID|MS_STRICTATIME,               true  },
650                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
651                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
652                 { "tmpfs",     "/run",      "tmpfs", "mode=755",  MS_NOSUID|MS_NODEV|MS_STRICTATIME,      true  },
653 #ifdef HAVE_SELINUX
654                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                              false },  /* Bind mount first */
655                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT,         false },  /* Then, make it r/o */
656 #endif
657         };
658
659         unsigned k;
660         int r = 0;
661
662         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
663                 _cleanup_free_ char *where = NULL;
664 #ifdef HAVE_SELINUX
665                 _cleanup_free_ char *options = NULL;
666 #endif
667                 const char *o;
668                 int t;
669
670                 where = strjoin(dest, "/", mount_table[k].where, NULL);
671                 if (!where)
672                         return log_oom();
673
674                 t = path_is_mount_point(where, true);
675                 if (t < 0) {
676                         log_error_errno(t, "Failed to detect whether %s is a mount point: %m", where);
677
678                         if (r == 0)
679                                 r = t;
680
681                         continue;
682                 }
683
684                 /* Skip this entry if it is not a remount. */
685                 if (mount_table[k].what && t > 0)
686                         continue;
687
688                 t = mkdir_p(where, 0755);
689                 if (t < 0) {
690                         if (mount_table[k].fatal) {
691                                log_error_errno(t, "Failed to create directory %s: %m", where);
692
693                                 if (r == 0)
694                                         r = t;
695                         } else
696                                log_warning_errno(t, "Failed to create directory %s: %m", where);
697
698                         continue;
699                 }
700
701 #ifdef HAVE_SELINUX
702                 if (arg_selinux_apifs_context &&
703                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
704                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
705                         if (!options)
706                                 return log_oom();
707
708                         o = options;
709                 } else
710 #endif
711                         o = mount_table[k].options;
712
713
714                 if (mount(mount_table[k].what,
715                           where,
716                           mount_table[k].type,
717                           mount_table[k].flags,
718                           o) < 0) {
719
720                         if (mount_table[k].fatal) {
721                                 log_error("mount(%s) failed: %m", where);
722
723                                 if (r == 0)
724                                         r = -errno;
725                         } else
726                                 log_warning("mount(%s) failed: %m", where);
727                 }
728         }
729
730         return r;
731 }
732
733 static int mount_binds(const char *dest, char **l, bool ro) {
734         char **x, **y;
735
736         STRV_FOREACH_PAIR(x, y, l) {
737                 _cleanup_free_ char *where = NULL;
738                 struct stat source_st, dest_st;
739                 int r;
740
741                 if (stat(*x, &source_st) < 0) {
742                         log_error("Failed to stat %s: %m", *x);
743                         return -errno;
744                 }
745
746                 where = strappend(dest, *y);
747                 if (!where)
748                         return log_oom();
749
750                 r = stat(where, &dest_st);
751                 if (r == 0) {
752                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
753                                 log_error("The file types of %s and %s do not match. Refusing bind mount", *x, where);
754                                 return -EINVAL;
755                         }
756                 } else if (errno == ENOENT) {
757                         r = mkdir_parents_label(where, 0755);
758                         if (r < 0)
759                                 return log_error_errno(r, "Failed to bind mount %s: %m", *x);
760                 } else {
761                         log_error("Failed to bind mount %s: %m", *x);
762                         return -errno;
763                 }
764
765                 /* Create the mount point, but be conservative -- refuse to create block
766                  * and char devices. */
767                 if (S_ISDIR(source_st.st_mode)) {
768                         r = mkdir_label(where, 0755);
769                         if (r < 0 && errno != EEXIST)
770                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
771                 } else if (S_ISFIFO(source_st.st_mode)) {
772                         r = mkfifo(where, 0644);
773                         if (r < 0 && errno != EEXIST) {
774                                 log_error("Failed to create mount point %s: %m", where);
775
776                                 return -errno;
777                         }
778                 } else if (S_ISSOCK(source_st.st_mode)) {
779                         r = mknod(where, 0644 | S_IFSOCK, 0);
780                         if (r < 0 && errno != EEXIST) {
781                                 log_error("Failed to create mount point %s: %m", where);
782
783                                 return -errno;
784                         }
785                 } else if (S_ISREG(source_st.st_mode)) {
786                         r = touch(where);
787                         if (r < 0)
788                                 return log_error_errno(r, "Failed to create mount point %s: %m", where);
789                 } else {
790                         log_error("Refusing to create mountpoint for file: %s", *x);
791                         return -ENOTSUP;
792                 }
793
794                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
795                         log_error("mount(%s) failed: %m", where);
796                         return -errno;
797                 }
798
799                 if (ro) {
800                         r = bind_remount_recursive(where, true);
801                         if (r < 0)
802                                 return log_error_errno(r, "Read-Only bind mount failed: %m");
803                 }
804         }
805
806         return 0;
807 }
808
809 static int mount_tmpfs(const char *dest) {
810         char **i, **o;
811
812         STRV_FOREACH_PAIR(i, o, arg_tmpfs) {
813                 _cleanup_free_ char *where = NULL;
814                 int r;
815
816                 where = strappend(dest, *i);
817                 if (!where)
818                         return log_oom();
819
820                 r = mkdir_label(where, 0755);
821                 if (r < 0 && errno != EEXIST)
822                         return log_error_errno(r, "creating mount point for tmpfs %s failed: %m", where);
823
824                 if (mount("tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, *o) < 0) {
825                         log_error("tmpfs mount to %s failed: %m", where);
826                         return -errno;
827                 }
828         }
829
830         return 0;
831 }
832
833 static int setup_timezone(const char *dest) {
834         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
835         char *z, *y;
836         int r;
837
838         assert(dest);
839
840         /* Fix the timezone, if possible */
841         r = readlink_malloc("/etc/localtime", &p);
842         if (r < 0) {
843                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
844                 return 0;
845         }
846
847         z = path_startswith(p, "../usr/share/zoneinfo/");
848         if (!z)
849                 z = path_startswith(p, "/usr/share/zoneinfo/");
850         if (!z) {
851                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
852                 return 0;
853         }
854
855         where = strappend(dest, "/etc/localtime");
856         if (!where)
857                 return log_oom();
858
859         r = readlink_malloc(where, &q);
860         if (r >= 0) {
861                 y = path_startswith(q, "../usr/share/zoneinfo/");
862                 if (!y)
863                         y = path_startswith(q, "/usr/share/zoneinfo/");
864
865                 /* Already pointing to the right place? Then do nothing .. */
866                 if (y && streq(y, z))
867                         return 0;
868         }
869
870         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
871         if (!check)
872                 return log_oom();
873
874         if (access(check, F_OK) < 0) {
875                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
876                 return 0;
877         }
878
879         what = strappend("../usr/share/zoneinfo/", z);
880         if (!what)
881                 return log_oom();
882
883         r = mkdir_parents(where, 0755);
884         if (r < 0) {
885                 log_error_errno(r, "Failed to create directory for timezone info %s in container: %m", where);
886
887                 return 0;
888         }
889
890         r = unlink(where);
891         if (r < 0 && errno != ENOENT) {
892                 log_error("Failed to remove existing timezone info %s in container: %m", where);
893
894                 return 0;
895         }
896
897         if (symlink(what, where) < 0) {
898                 log_error("Failed to correct timezone of container: %m");
899                 return 0;
900         }
901
902         return 0;
903 }
904
905 static int setup_resolv_conf(const char *dest) {
906         _cleanup_free_ char *where = NULL;
907         int r;
908
909         assert(dest);
910
911         if (arg_private_network)
912                 return 0;
913
914         /* Fix resolv.conf, if possible */
915         where = strappend(dest, "/etc/resolv.conf");
916         if (!where)
917                 return log_oom();
918
919         /* We don't really care for the results of this really. If it
920          * fails, it fails, but meh... */
921         r = mkdir_parents(where, 0755);
922         if (r < 0) {
923                 log_warning_errno(r, "Failed to create parent directory for resolv.conf %s: %m", where);
924
925                 return 0;
926         }
927
928         r = copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW, 0644);
929         if (r < 0) {
930                 log_warning_errno(r, "Failed to copy /etc/resolv.conf to %s: %m", where);
931
932                 return 0;
933         }
934
935         return 0;
936 }
937
938 static int setup_volatile_state(const char *directory) {
939         const char *p;
940         int r;
941
942         assert(directory);
943
944         if (arg_volatile != VOLATILE_STATE)
945                 return 0;
946
947         /* --volatile=state means we simply overmount /var
948            with a tmpfs, and the rest read-only. */
949
950         r = bind_remount_recursive(directory, true);
951         if (r < 0)
952                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
953
954         p = strappenda(directory, "/var");
955         r = mkdir(p, 0755);
956         if (r < 0 && errno != EEXIST) {
957                 log_error("Failed to create %s: %m", directory);
958                 return -errno;
959         }
960
961         if (mount("tmpfs", p, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
962                 log_error("Failed to mount tmpfs to /var: %m");
963                 return -errno;
964         }
965
966         return 0;
967 }
968
969 static int setup_volatile(const char *directory) {
970         bool tmpfs_mounted = false, bind_mounted = false;
971         char template[] = "/tmp/nspawn-volatile-XXXXXX";
972         const char *f, *t;
973         int r;
974
975         assert(directory);
976
977         if (arg_volatile != VOLATILE_YES)
978                 return 0;
979
980         /* --volatile=yes means we mount a tmpfs to the root dir, and
981            the original /usr to use inside it, and that read-only. */
982
983         if (!mkdtemp(template)) {
984                 log_error("Failed to create temporary directory: %m");
985                 return -errno;
986         }
987
988         if (mount("tmpfs", template, "tmpfs", MS_STRICTATIME, "mode=755") < 0) {
989                 log_error("Failed to mount tmpfs for root directory: %m");
990                 r = -errno;
991                 goto fail;
992         }
993
994         tmpfs_mounted = true;
995
996         f = strappenda(directory, "/usr");
997         t = strappenda(template, "/usr");
998
999         r = mkdir(t, 0755);
1000         if (r < 0 && errno != EEXIST) {
1001                 log_error("Failed to create %s: %m", t);
1002                 r = -errno;
1003                 goto fail;
1004         }
1005
1006         if (mount(f, t, "bind", MS_BIND|MS_REC, NULL) < 0) {
1007                 log_error("Failed to create /usr bind mount: %m");
1008                 r = -errno;
1009                 goto fail;
1010         }
1011
1012         bind_mounted = true;
1013
1014         r = bind_remount_recursive(t, true);
1015         if (r < 0) {
1016                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1017                 goto fail;
1018         }
1019
1020         if (mount(template, directory, NULL, MS_MOVE, NULL) < 0) {
1021                 log_error("Failed to move root mount: %m");
1022                 r = -errno;
1023                 goto fail;
1024         }
1025
1026         rmdir(template);
1027
1028         return 0;
1029
1030 fail:
1031         if (bind_mounted)
1032                 umount(t);
1033         if (tmpfs_mounted)
1034                 umount(template);
1035         rmdir(template);
1036         return r;
1037 }
1038
1039 static char* id128_format_as_uuid(sd_id128_t id, char s[37]) {
1040
1041         snprintf(s, 37,
1042                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
1043                  SD_ID128_FORMAT_VAL(id));
1044
1045         return s;
1046 }
1047
1048 static int setup_boot_id(const char *dest) {
1049         _cleanup_free_ char *from = NULL, *to = NULL;
1050         sd_id128_t rnd = {};
1051         char as_uuid[37];
1052         int r;
1053
1054         assert(dest);
1055
1056         if (arg_share_system)
1057                 return 0;
1058
1059         /* Generate a new randomized boot ID, so that each boot-up of
1060          * the container gets a new one */
1061
1062         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
1063         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
1064         if (!from || !to)
1065                 return log_oom();
1066
1067         r = sd_id128_randomize(&rnd);
1068         if (r < 0)
1069                 return log_error_errno(r, "Failed to generate random boot id: %m");
1070
1071         id128_format_as_uuid(rnd, as_uuid);
1072
1073         r = write_string_file(from, as_uuid);
1074         if (r < 0)
1075                 return log_error_errno(r, "Failed to write boot id: %m");
1076
1077         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1078                 log_error("Failed to bind mount boot id: %m");
1079                 r = -errno;
1080         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
1081                 log_warning("Failed to make boot id read-only: %m");
1082
1083         unlink(from);
1084         return r;
1085 }
1086
1087 static int copy_devnodes(const char *dest) {
1088
1089         static const char devnodes[] =
1090                 "null\0"
1091                 "zero\0"
1092                 "full\0"
1093                 "random\0"
1094                 "urandom\0"
1095                 "tty\0"
1096                 "net/tun\0";
1097
1098         const char *d;
1099         int r = 0;
1100         _cleanup_umask_ mode_t u;
1101
1102         assert(dest);
1103
1104         u = umask(0000);
1105
1106         NULSTR_FOREACH(d, devnodes) {
1107                 _cleanup_free_ char *from = NULL, *to = NULL;
1108                 struct stat st;
1109
1110                 from = strappend("/dev/", d);
1111                 to = strjoin(dest, "/dev/", d, NULL);
1112                 if (!from || !to)
1113                         return log_oom();
1114
1115                 if (stat(from, &st) < 0) {
1116
1117                         if (errno != ENOENT) {
1118                                 log_error("Failed to stat %s: %m", from);
1119                                 return -errno;
1120                         }
1121
1122                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
1123
1124                         log_error("%s is not a char or block device, cannot copy", from);
1125                         return -EIO;
1126
1127                 } else {
1128                         r = mkdir_parents(to, 0775);
1129                         if (r < 0) {
1130                                 log_error_errno(r, "Failed to create parent directory of %s: %m", to);
1131                                 return -r;
1132                         }
1133
1134                         if (mknod(to, st.st_mode, st.st_rdev) < 0) {
1135                                 log_error("mknod(%s) failed: %m", dest);
1136                                 return  -errno;
1137                         }
1138                 }
1139         }
1140
1141         return r;
1142 }
1143
1144 static int setup_ptmx(const char *dest) {
1145         _cleanup_free_ char *p = NULL;
1146
1147         p = strappend(dest, "/dev/ptmx");
1148         if (!p)
1149                 return log_oom();
1150
1151         if (symlink("pts/ptmx", p) < 0) {
1152                 log_error("Failed to create /dev/ptmx symlink: %m");
1153                 return -errno;
1154         }
1155
1156         return 0;
1157 }
1158
1159 static int setup_dev_console(const char *dest, const char *console) {
1160         _cleanup_umask_ mode_t u;
1161         const char *to;
1162         struct stat st;
1163         int r;
1164
1165         assert(dest);
1166         assert(console);
1167
1168         u = umask(0000);
1169
1170         if (stat("/dev/null", &st) < 0) {
1171                 log_error("Failed to stat /dev/null: %m");
1172                 return -errno;
1173         }
1174
1175         r = chmod_and_chown(console, 0600, 0, 0);
1176         if (r < 0)
1177                 return log_error_errno(r, "Failed to correct access mode for TTY: %m");
1178
1179         /* We need to bind mount the right tty to /dev/console since
1180          * ptys can only exist on pts file systems. To have something
1181          * to bind mount things on we create a device node first, and
1182          * use /dev/null for that since we the cgroups device policy
1183          * allows us to create that freely, while we cannot create
1184          * /dev/console. (Note that the major minor doesn't actually
1185          * matter here, since we mount it over anyway). */
1186
1187         to = strappenda(dest, "/dev/console");
1188         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
1189                 log_error("mknod() for /dev/console failed: %m");
1190                 return -errno;
1191         }
1192
1193         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
1194                 log_error("Bind mount for /dev/console failed: %m");
1195                 return -errno;
1196         }
1197
1198         return 0;
1199 }
1200
1201 static int setup_kmsg(const char *dest, int kmsg_socket) {
1202         _cleanup_free_ char *from = NULL, *to = NULL;
1203         int r, fd, k;
1204         _cleanup_umask_ mode_t u;
1205         union {
1206                 struct cmsghdr cmsghdr;
1207                 uint8_t buf[CMSG_SPACE(sizeof(int))];
1208         } control = {};
1209         struct msghdr mh = {
1210                 .msg_control = &control,
1211                 .msg_controllen = sizeof(control),
1212         };
1213         struct cmsghdr *cmsg;
1214
1215         assert(dest);
1216         assert(kmsg_socket >= 0);
1217
1218         u = umask(0000);
1219
1220         /* We create the kmsg FIFO as /dev/kmsg, but immediately
1221          * delete it after bind mounting it to /proc/kmsg. While FIFOs
1222          * on the reading side behave very similar to /proc/kmsg,
1223          * their writing side behaves differently from /dev/kmsg in
1224          * that writing blocks when nothing is reading. In order to
1225          * avoid any problems with containers deadlocking due to this
1226          * we simply make /dev/kmsg unavailable to the container. */
1227         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
1228             asprintf(&to, "%s/proc/kmsg", dest) < 0)
1229                 return log_oom();
1230
1231         if (mkfifo(from, 0600) < 0) {
1232                 log_error("mkfifo() for /dev/kmsg failed: %m");
1233                 return -errno;
1234         }
1235
1236         r = chmod_and_chown(from, 0600, 0, 0);
1237         if (r < 0)
1238                 return log_error_errno(r, "Failed to correct access mode for /dev/kmsg: %m");
1239
1240         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
1241                 log_error("Bind mount for /proc/kmsg failed: %m");
1242                 return -errno;
1243         }
1244
1245         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
1246         if (fd < 0) {
1247                 log_error("Failed to open fifo: %m");
1248                 return -errno;
1249         }
1250
1251         cmsg = CMSG_FIRSTHDR(&mh);
1252         cmsg->cmsg_level = SOL_SOCKET;
1253         cmsg->cmsg_type = SCM_RIGHTS;
1254         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
1255         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
1256
1257         mh.msg_controllen = cmsg->cmsg_len;
1258
1259         /* Store away the fd in the socket, so that it stays open as
1260          * long as we run the child */
1261         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
1262         safe_close(fd);
1263
1264         if (k < 0) {
1265                 log_error("Failed to send FIFO fd: %m");
1266                 return -errno;
1267         }
1268
1269         /* And now make the FIFO unavailable as /dev/kmsg... */
1270         unlink(from);
1271         return 0;
1272 }
1273
1274 static int setup_hostname(void) {
1275
1276         if (arg_share_system)
1277                 return 0;
1278
1279         if (sethostname_idempotent(arg_machine) < 0)
1280                 return -errno;
1281
1282         return 0;
1283 }
1284
1285 static int setup_journal(const char *directory) {
1286         sd_id128_t machine_id, this_id;
1287         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1288         char *id;
1289         int r;
1290
1291         p = strappend(directory, "/etc/machine-id");
1292         if (!p)
1293                 return log_oom();
1294
1295         r = read_one_line_file(p, &b);
1296         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1297                 return 0;
1298         else if (r < 0)
1299                 return log_error_errno(r, "Failed to read machine ID from %s: %m", p);
1300
1301         id = strstrip(b);
1302         if (isempty(id) && arg_link_journal == LINK_AUTO)
1303                 return 0;
1304
1305         /* Verify validity */
1306         r = sd_id128_from_string(id, &machine_id);
1307         if (r < 0)
1308                 return log_error_errno(r, "Failed to parse machine ID from %s: %m", p);
1309
1310         r = sd_id128_get_machine(&this_id);
1311         if (r < 0)
1312                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
1313
1314         if (sd_id128_equal(machine_id, this_id)) {
1315                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1316                          "Host and machine ids are equal (%s): refusing to link journals", id);
1317                 if (arg_link_journal == LINK_AUTO)
1318                         return 0;
1319                 return
1320                         -EEXIST;
1321         }
1322
1323         if (arg_link_journal == LINK_NO)
1324                 return 0;
1325
1326         free(p);
1327         p = strappend("/var/log/journal/", id);
1328         q = strjoin(directory, "/var/log/journal/", id, NULL);
1329         if (!p || !q)
1330                 return log_oom();
1331
1332         if (path_is_mount_point(p, false) > 0) {
1333                 if (arg_link_journal != LINK_AUTO) {
1334                         log_error("%s: already a mount point, refusing to use for journal", p);
1335                         return -EEXIST;
1336                 }
1337
1338                 return 0;
1339         }
1340
1341         if (path_is_mount_point(q, false) > 0) {
1342                 if (arg_link_journal != LINK_AUTO) {
1343                         log_error("%s: already a mount point, refusing to use for journal", q);
1344                         return -EEXIST;
1345                 }
1346
1347                 return 0;
1348         }
1349
1350         r = readlink_and_make_absolute(p, &d);
1351         if (r >= 0) {
1352                 if ((arg_link_journal == LINK_GUEST ||
1353                      arg_link_journal == LINK_AUTO) &&
1354                     path_equal(d, q)) {
1355
1356                         r = mkdir_p(q, 0755);
1357                         if (r < 0)
1358                                 log_warning("Failed to create directory %s: %m", q);
1359                         return 0;
1360                 }
1361
1362                 if (unlink(p) < 0) {
1363                         log_error("Failed to remove symlink %s: %m", p);
1364                         return -errno;
1365                 }
1366         } else if (r == -EINVAL) {
1367
1368                 if (arg_link_journal == LINK_GUEST &&
1369                     rmdir(p) < 0) {
1370
1371                         if (errno == ENOTDIR) {
1372                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1373                                 return r;
1374                         } else {
1375                                 log_error("Failed to remove %s: %m", p);
1376                                 return -errno;
1377                         }
1378                 }
1379         } else if (r != -ENOENT) {
1380                 log_error("readlink(%s) failed: %m", p);
1381                 return r;
1382         }
1383
1384         if (arg_link_journal == LINK_GUEST) {
1385
1386                 if (symlink(q, p) < 0) {
1387                         if (arg_link_journal_try) {
1388                                 log_debug("Failed to symlink %s to %s, skipping journal setup: %m", q, p);
1389                                 return 0;
1390                         } else {
1391                                 log_error("Failed to symlink %s to %s: %m", q, p);
1392                                 return -errno;
1393                         }
1394                 }
1395
1396                 r = mkdir_p(q, 0755);
1397                 if (r < 0)
1398                         log_warning("Failed to create directory %s: %m", q);
1399                 return 0;
1400         }
1401
1402         if (arg_link_journal == LINK_HOST) {
1403                 /* don't create parents here -- if the host doesn't have
1404                  * permanent journal set up, don't force it here */
1405                 r = mkdir(p, 0755);
1406                 if (r < 0) {
1407                         if (arg_link_journal_try) {
1408                                 log_debug("Failed to create %s, skipping journal setup: %m", p);
1409                                 return 0;
1410                         } else {
1411                                 log_error("Failed to create %s: %m", p);
1412                                 return r;
1413                         }
1414                 }
1415
1416         } else if (access(p, F_OK) < 0)
1417                 return 0;
1418
1419         if (dir_is_empty(q) == 0)
1420                 log_warning("%s is not empty, proceeding anyway.", q);
1421
1422         r = mkdir_p(q, 0755);
1423         if (r < 0) {
1424                 log_error("Failed to create %s: %m", q);
1425                 return r;
1426         }
1427
1428         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1429                 log_error("Failed to bind mount journal from host into guest: %m");
1430                 return -errno;
1431         }
1432
1433         return 0;
1434 }
1435
1436 static int drop_capabilities(void) {
1437         return capability_bounding_set_drop(~arg_retain, false);
1438 }
1439
1440 static int register_machine(pid_t pid, int local_ifindex) {
1441         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1442         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1443         int r;
1444
1445         if (!arg_register)
1446                 return 0;
1447
1448         r = sd_bus_default_system(&bus);
1449         if (r < 0)
1450                 return log_error_errno(r, "Failed to open system bus: %m");
1451
1452         if (arg_keep_unit) {
1453                 r = sd_bus_call_method(
1454                                 bus,
1455                                 "org.freedesktop.machine1",
1456                                 "/org/freedesktop/machine1",
1457                                 "org.freedesktop.machine1.Manager",
1458                                 "RegisterMachineWithNetwork",
1459                                 &error,
1460                                 NULL,
1461                                 "sayssusai",
1462                                 arg_machine,
1463                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1464                                 "nspawn",
1465                                 "container",
1466                                 (uint32_t) pid,
1467                                 strempty(arg_directory),
1468                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1469         } else {
1470                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1471
1472                 r = sd_bus_message_new_method_call(
1473                                 bus,
1474                                 &m,
1475                                 "org.freedesktop.machine1",
1476                                 "/org/freedesktop/machine1",
1477                                 "org.freedesktop.machine1.Manager",
1478                                 "CreateMachineWithNetwork");
1479                 if (r < 0)
1480                         return log_error_errno(r, "Failed to create message: %m");
1481
1482                 r = sd_bus_message_append(
1483                                 m,
1484                                 "sayssusai",
1485                                 arg_machine,
1486                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1487                                 "nspawn",
1488                                 "container",
1489                                 (uint32_t) pid,
1490                                 strempty(arg_directory),
1491                                 local_ifindex > 0 ? 1 : 0, local_ifindex);
1492                 if (r < 0)
1493                         return log_error_errno(r, "Failed to append message arguments: %m");
1494
1495                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1496                 if (r < 0)
1497                         return log_error_errno(r, "Failed to open container: %m");
1498
1499                 if (!isempty(arg_slice)) {
1500                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1501                         if (r < 0)
1502                                 return log_error_errno(r, "Failed to append slice: %m");
1503                 }
1504
1505                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1506                 if (r < 0)
1507                         return log_error_errno(r, "Failed to add device policy: %m");
1508
1509                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 9,
1510                                           /* Allow the container to
1511                                            * access and create the API
1512                                            * device nodes, so that
1513                                            * PrivateDevices= in the
1514                                            * container can work
1515                                            * fine */
1516                                           "/dev/null", "rwm",
1517                                           "/dev/zero", "rwm",
1518                                           "/dev/full", "rwm",
1519                                           "/dev/random", "rwm",
1520                                           "/dev/urandom", "rwm",
1521                                           "/dev/tty", "rwm",
1522                                           "/dev/net/tun", "rwm",
1523                                           /* Allow the container
1524                                            * access to ptys. However,
1525                                            * do not permit the
1526                                            * container to ever create
1527                                            * these device nodes. */
1528                                           "/dev/pts/ptmx", "rw",
1529                                           "char-pts", "rw");
1530                 if (r < 0)
1531                         return log_error_errno(r, "Failed to add device whitelist: %m");
1532
1533                 r = sd_bus_message_close_container(m);
1534                 if (r < 0)
1535                         return log_error_errno(r, "Failed to close container: %m");
1536
1537                 r = sd_bus_call(bus, m, 0, &error, NULL);
1538         }
1539
1540         if (r < 0) {
1541                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1542                 return r;
1543         }
1544
1545         return 0;
1546 }
1547
1548 static int terminate_machine(pid_t pid) {
1549         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1550         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1551         _cleanup_bus_close_unref_ sd_bus *bus = NULL;
1552         const char *path;
1553         int r;
1554
1555         if (!arg_register)
1556                 return 0;
1557
1558         r = sd_bus_default_system(&bus);
1559         if (r < 0)
1560                 return log_error_errno(r, "Failed to open system bus: %m");
1561
1562         r = sd_bus_call_method(
1563                         bus,
1564                         "org.freedesktop.machine1",
1565                         "/org/freedesktop/machine1",
1566                         "org.freedesktop.machine1.Manager",
1567                         "GetMachineByPID",
1568                         &error,
1569                         &reply,
1570                         "u",
1571                         (uint32_t) pid);
1572         if (r < 0) {
1573                 /* Note that the machine might already have been
1574                  * cleaned up automatically, hence don't consider it a
1575                  * failure if we cannot get the machine object. */
1576                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1577                 return 0;
1578         }
1579
1580         r = sd_bus_message_read(reply, "o", &path);
1581         if (r < 0)
1582                 return bus_log_parse_error(r);
1583
1584         r = sd_bus_call_method(
1585                         bus,
1586                         "org.freedesktop.machine1",
1587                         path,
1588                         "org.freedesktop.machine1.Machine",
1589                         "Terminate",
1590                         &error,
1591                         NULL,
1592                         NULL);
1593         if (r < 0) {
1594                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1595                 return 0;
1596         }
1597
1598         return 0;
1599 }
1600
1601 static int reset_audit_loginuid(void) {
1602         _cleanup_free_ char *p = NULL;
1603         int r;
1604
1605         if (arg_share_system)
1606                 return 0;
1607
1608         r = read_one_line_file("/proc/self/loginuid", &p);
1609         if (r == -ENOENT)
1610                 return 0;
1611         if (r < 0)
1612                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
1613
1614         /* Already reset? */
1615         if (streq(p, "4294967295"))
1616                 return 0;
1617
1618         r = write_string_file("/proc/self/loginuid", "4294967295");
1619         if (r < 0) {
1620                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1621                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1622                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1623                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1624                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1625
1626                 sleep(5);
1627         }
1628
1629         return 0;
1630 }
1631
1632 #define HOST_HASH_KEY SD_ID128_MAKE(1a,37,6f,c7,46,ec,45,0b,ad,a3,d5,31,06,60,5d,b1)
1633 #define CONTAINER_HASH_KEY SD_ID128_MAKE(c3,c4,f9,19,b5,57,b2,1c,e6,cf,14,27,03,9c,ee,a2)
1634
1635 static int generate_mac(struct ether_addr *mac, sd_id128_t hash_key) {
1636         int r;
1637
1638         uint8_t result[8];
1639         size_t l, sz;
1640         uint8_t *v;
1641
1642         l = strlen(arg_machine);
1643         sz = sizeof(sd_id128_t) + l;
1644         v = alloca(sz);
1645
1646         /* fetch some persistent data unique to the host */
1647         r = sd_id128_get_machine((sd_id128_t*) v);
1648         if (r < 0)
1649                 return r;
1650
1651         /* combine with some data unique (on this host) to this
1652          * container instance */
1653         memcpy(v + sizeof(sd_id128_t), arg_machine, l);
1654
1655         /* Let's hash the host machine ID plus the container name. We
1656          * use a fixed, but originally randomly created hash key here. */
1657         siphash24(result, v, sz, hash_key.bytes);
1658
1659         assert_cc(ETH_ALEN <= sizeof(result));
1660         memcpy(mac->ether_addr_octet, result, ETH_ALEN);
1661
1662         /* see eth_random_addr in the kernel */
1663         mac->ether_addr_octet[0] &= 0xfe;        /* clear multicast bit */
1664         mac->ether_addr_octet[0] |= 0x02;        /* set local assignment bit (IEEE802) */
1665
1666         return 0;
1667 }
1668
1669 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ], int *ifi) {
1670         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1671         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1672         struct ether_addr mac_host, mac_container;
1673         int r, i;
1674
1675         if (!arg_private_network)
1676                 return 0;
1677
1678         if (!arg_network_veth)
1679                 return 0;
1680
1681         /* Use two different interface name prefixes depending whether
1682          * we are in bridge mode or not. */
1683         snprintf(iface_name, IFNAMSIZ - 1, "%s-%s",
1684                  arg_network_bridge ? "vb" : "ve", arg_machine);
1685
1686         r = generate_mac(&mac_container, CONTAINER_HASH_KEY);
1687         if (r < 0) {
1688                 log_error("Failed to generate predictable MAC address for container side");
1689                 return r;
1690         }
1691
1692         r = generate_mac(&mac_host, HOST_HASH_KEY);
1693         if (r < 0) {
1694                 log_error("Failed to generate predictable MAC address for host side");
1695                 return r;
1696         }
1697
1698         r = sd_rtnl_open(&rtnl, 0);
1699         if (r < 0)
1700                 return log_error_errno(r, "Failed to connect to netlink: %m");
1701
1702         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1703         if (r < 0)
1704                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1705
1706         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1707         if (r < 0)
1708                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1709
1710         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_host);
1711         if (r < 0)
1712                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1713
1714         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1715         if (r < 0)
1716                 return log_error_errno(r, "Failed to open netlink container: %m");
1717
1718         r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "veth");
1719         if (r < 0)
1720                 return log_error_errno(r, "Failed to open netlink container: %m");
1721
1722         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1723         if (r < 0)
1724                 return log_error_errno(r, "Failed to open netlink container: %m");
1725
1726         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1727         if (r < 0)
1728                 return log_error_errno(r, "Failed to add netlink interface name: %m");
1729
1730         r = sd_rtnl_message_append_ether_addr(m, IFLA_ADDRESS, &mac_container);
1731         if (r < 0)
1732                 return log_error_errno(r, "Failed to add netlink MAC address: %m");
1733
1734         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1735         if (r < 0)
1736                 return log_error_errno(r, "Failed to add netlink namespace field: %m");
1737
1738         r = sd_rtnl_message_close_container(m);
1739         if (r < 0)
1740                 return log_error_errno(r, "Failed to close netlink container: %m");
1741
1742         r = sd_rtnl_message_close_container(m);
1743         if (r < 0)
1744                 return log_error_errno(r, "Failed to close netlink container: %m");
1745
1746         r = sd_rtnl_message_close_container(m);
1747         if (r < 0)
1748                 return log_error_errno(r, "Failed to close netlink container: %m");
1749
1750         r = sd_rtnl_call(rtnl, m, 0, NULL);
1751         if (r < 0)
1752                 return log_error_errno(r, "Failed to add new veth interfaces: %m");
1753
1754         i = (int) if_nametoindex(iface_name);
1755         if (i <= 0) {
1756                 log_error("Failed to resolve interface %s: %m", iface_name);
1757                 return -errno;
1758         }
1759
1760         *ifi = i;
1761
1762         return 0;
1763 }
1764
1765 static int setup_bridge(const char veth_name[], int *ifi) {
1766         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1767         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1768         int r, bridge;
1769
1770         if (!arg_private_network)
1771                 return 0;
1772
1773         if (!arg_network_veth)
1774                 return 0;
1775
1776         if (!arg_network_bridge)
1777                 return 0;
1778
1779         bridge = (int) if_nametoindex(arg_network_bridge);
1780         if (bridge <= 0) {
1781                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1782                 return -errno;
1783         }
1784
1785         *ifi = bridge;
1786
1787         r = sd_rtnl_open(&rtnl, 0);
1788         if (r < 0)
1789                 return log_error_errno(r, "Failed to connect to netlink: %m");
1790
1791         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1792         if (r < 0)
1793                 return log_error_errno(r, "Failed to allocate netlink message: %m");
1794
1795         r = sd_rtnl_message_link_set_flags(m, IFF_UP, IFF_UP);
1796         if (r < 0)
1797                 return log_error_errno(r, "Failed to set IFF_UP flag: %m");
1798
1799         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1800         if (r < 0)
1801                 return log_error_errno(r, "Failed to add netlink interface name field: %m");
1802
1803         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1804         if (r < 0)
1805                 return log_error_errno(r, "Failed to add netlink master field: %m");
1806
1807         r = sd_rtnl_call(rtnl, m, 0, NULL);
1808         if (r < 0)
1809                 return log_error_errno(r, "Failed to add veth interface to bridge: %m");
1810
1811         return 0;
1812 }
1813
1814 static int parse_interface(struct udev *udev, const char *name) {
1815         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1816         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1817         int ifi;
1818
1819         ifi = (int) if_nametoindex(name);
1820         if (ifi <= 0) {
1821                 log_error("Failed to resolve interface %s: %m", name);
1822                 return -errno;
1823         }
1824
1825         sprintf(ifi_str, "n%i", ifi);
1826         d = udev_device_new_from_device_id(udev, ifi_str);
1827         if (!d) {
1828                 log_error("Failed to get udev device for interface %s: %m", name);
1829                 return -errno;
1830         }
1831
1832         if (udev_device_get_is_initialized(d) <= 0) {
1833                 log_error("Network interface %s is not initialized yet.", name);
1834                 return -EBUSY;
1835         }
1836
1837         return ifi;
1838 }
1839
1840 static int move_network_interfaces(pid_t pid) {
1841         _cleanup_udev_unref_ struct udev *udev = NULL;
1842         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1843         char **i;
1844         int r;
1845
1846         if (!arg_private_network)
1847                 return 0;
1848
1849         if (strv_isempty(arg_network_interfaces))
1850                 return 0;
1851
1852         r = sd_rtnl_open(&rtnl, 0);
1853         if (r < 0)
1854                 return log_error_errno(r, "Failed to connect to netlink: %m");
1855
1856         udev = udev_new();
1857         if (!udev) {
1858                 log_error("Failed to connect to udev.");
1859                 return -ENOMEM;
1860         }
1861
1862         STRV_FOREACH(i, arg_network_interfaces) {
1863                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1864                 int ifi;
1865
1866                 ifi = parse_interface(udev, *i);
1867                 if (ifi < 0)
1868                         return ifi;
1869
1870                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, ifi);
1871                 if (r < 0)
1872                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1873
1874                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1875                 if (r < 0)
1876                         return log_error_errno(r, "Failed to append namespace PID to netlink message: %m");
1877
1878                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1879                 if (r < 0)
1880                         return log_error_errno(r, "Failed to move interface %s to namespace: %m", *i);
1881         }
1882
1883         return 0;
1884 }
1885
1886 static int setup_macvlan(pid_t pid) {
1887         _cleanup_udev_unref_ struct udev *udev = NULL;
1888         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1889         char **i;
1890         int r;
1891
1892         if (!arg_private_network)
1893                 return 0;
1894
1895         if (strv_isempty(arg_network_macvlan))
1896                 return 0;
1897
1898         r = sd_rtnl_open(&rtnl, 0);
1899         if (r < 0)
1900                 return log_error_errno(r, "Failed to connect to netlink: %m");
1901
1902         udev = udev_new();
1903         if (!udev) {
1904                 log_error("Failed to connect to udev.");
1905                 return -ENOMEM;
1906         }
1907
1908         STRV_FOREACH(i, arg_network_macvlan) {
1909                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1910                 _cleanup_free_ char *n = NULL;
1911                 int ifi;
1912
1913                 ifi = parse_interface(udev, *i);
1914                 if (ifi < 0)
1915                         return ifi;
1916
1917                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1918                 if (r < 0)
1919                         return log_error_errno(r, "Failed to allocate netlink message: %m");
1920
1921                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1922                 if (r < 0)
1923                         return log_error_errno(r, "Failed to add netlink interface index: %m");
1924
1925                 n = strappend("mv-", *i);
1926                 if (!n)
1927                         return log_oom();
1928
1929                 strshorten(n, IFNAMSIZ-1);
1930
1931                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1932                 if (r < 0)
1933                         return log_error_errno(r, "Failed to add netlink interface name: %m");
1934
1935                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1936                 if (r < 0)
1937                         return log_error_errno(r, "Failed to add netlink namespace field: %m");
1938
1939                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1940                 if (r < 0)
1941                         return log_error_errno(r, "Failed to open netlink container: %m");
1942
1943                 r = sd_rtnl_message_open_container_union(m, IFLA_INFO_DATA, "macvlan");
1944                 if (r < 0)
1945                         return log_error_errno(r, "Failed to open netlink container: %m");
1946
1947                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1948                 if (r < 0)
1949                         return log_error_errno(r, "Failed to append macvlan mode: %m");
1950
1951                 r = sd_rtnl_message_close_container(m);
1952                 if (r < 0)
1953                         return log_error_errno(r, "Failed to close netlink container: %m");
1954
1955                 r = sd_rtnl_message_close_container(m);
1956                 if (r < 0)
1957                         return log_error_errno(r, "Failed to close netlink container: %m");
1958
1959                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1960                 if (r < 0)
1961                         return log_error_errno(r, "Failed to add new macvlan interfaces: %m");
1962         }
1963
1964         return 0;
1965 }
1966
/* Builds and loads a seccomp filter that allows everything by default,
 * makes a fixed list of system calls fail with EPERM, and makes
 * socket(AF_NETLINK, *, NETLINK_AUDIT) fail with EAFNOSUPPORT.
 *
 * Returns the result of the last seccomp call (negative on failure).
 * When built without HAVE_SECCOMP this is a no-op returning 0. */
static int setup_seccomp(void) {

#ifdef HAVE_SECCOMP
        static const int blacklist[] = {
                SCMP_SYS(kexec_load),
                SCMP_SYS(open_by_handle_at),
                SCMP_SYS(init_module),
                SCMP_SYS(finit_module),
                SCMP_SYS(delete_module),
                SCMP_SYS(iopl),
                SCMP_SYS(ioperm),
                SCMP_SYS(swapon),
                SCMP_SYS(swapoff),
        };

        scmp_filter_ctx ctx;
        unsigned n;
        int r;

        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        r = seccomp_add_secondary_archs(ctx);
        if (r < 0) {
                log_error_errno(r, "Failed to add secondary archs to seccomp filter: %m");
                goto finish;
        }

        for (n = 0; n < ELEMENTSOF(blacklist); n++) {
                r = seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), blacklist[n], 0);

                /* -EFAULT means the syscall is unknown; such entries are
                 * skipped, any other failure is fatal. */
                if (r < 0 && r != -EFAULT) {
                        log_error_errno(r, "Failed to block syscall: %m");
                        goto finish;
                }
        }

        /* Audit is broken in containers: much of the userspace audit
         * hookup fails when running inside one. We don't care and simply
         * turn off creation of audit sockets.
         *
         * The rule below makes socket(AF_NETLINK, *, NETLINK_AUDIT) fail
         * with EAFNOSUPPORT, which audit userspace takes as indication
         * that audit is disabled in the kernel. */
        r = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error_errno(r, "Failed to add audit seccomp rule: %m");
                goto finish;
        }

        /* Set the filter's SCMP_FLTATR_CTL_NNP attribute to 0. */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error_errno(r, "Failed to unset NO_NEW_PRIVS: %m");
                goto finish;
        }

        r = seccomp_load(ctx);
        if (r < 0)
                log_error_errno(r, "Failed to install seccomp audit filter: %m");

finish:
        /* The filter context is released on every path past
         * seccomp_init(). */
        seccomp_release(ctx);
        return r;
#else
        return 0;
#endif

}
2046
2047 static int setup_image(char **device_path, int *loop_nr) {
2048         struct loop_info64 info = {
2049                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
2050         };
2051         _cleanup_close_ int fd = -1, control = -1, loop = -1;
2052         _cleanup_free_ char* loopdev = NULL;
2053         struct stat st;
2054         int r, nr;
2055
2056         assert(device_path);
2057         assert(loop_nr);
2058
2059         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2060         if (fd < 0) {
2061                 log_error("Failed to open %s: %m", arg_image);
2062                 return -errno;
2063         }
2064
2065         if (fstat(fd, &st) < 0) {
2066                 log_error("Failed to stat %s: %m", arg_image);
2067                 return -errno;
2068         }
2069
2070         if (S_ISBLK(st.st_mode)) {
2071                 char *p;
2072
2073                 p = strdup(arg_image);
2074                 if (!p)
2075                         return log_oom();
2076
2077                 *device_path = p;
2078
2079                 *loop_nr = -1;
2080
2081                 r = fd;
2082                 fd = -1;
2083
2084                 return r;
2085         }
2086
2087         if (!S_ISREG(st.st_mode)) {
2088                 log_error("%s is not a regular file or block device: %m", arg_image);
2089                 return -EINVAL;
2090         }
2091
2092         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2093         if (control < 0) {
2094                 log_error("Failed to open /dev/loop-control: %m");
2095                 return -errno;
2096         }
2097
2098         nr = ioctl(control, LOOP_CTL_GET_FREE);
2099         if (nr < 0) {
2100                 log_error("Failed to allocate loop device: %m");
2101                 return -errno;
2102         }
2103
2104         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
2105                 return log_oom();
2106
2107         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
2108         if (loop < 0) {
2109                 log_error("Failed to open loop device %s: %m", loopdev);
2110                 return -errno;
2111         }
2112
2113         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
2114                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
2115                 return -errno;
2116         }
2117
2118         if (arg_read_only)
2119                 info.lo_flags |= LO_FLAGS_READ_ONLY;
2120
2121         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
2122                 log_error("Failed to set loopback settings on %s: %m", loopdev);
2123                 return -errno;
2124         }
2125
2126         *device_path = loopdev;
2127         loopdev = NULL;
2128
2129         *loop_nr = nr;
2130
2131         r = loop;
2132         loop = -1;
2133
2134         return r;
2135 }
2136
#ifdef HAVE_BLKID
/* Remembers `node` as the device for one partition role, unless a
 * partition with a lower or equal number has been remembered for that
 * role already.
 *
 * *device, *device_nr and *device_rw form the per-role state; they are
 * updated only when the new partition wins. The read-write flag is
 * derived from GPT_FLAG_READ_ONLY in `flags`.
 *
 * Returns 0, or the result of log_oom() if duplicating the node path
 * fails (in which case *device is left NULL). */
static int dissect_pick_partition(
                char **device, int *device_nr, bool *device_rw,
                const char *node, int nr, unsigned long long flags) {

        if (*device && nr >= *device_nr)
                return 0;

        *device_nr = nr;
        *device_rw = !(flags & GPT_FLAG_READ_ONLY);

        free(*device);
        *device = strdup(node);
        if (!*device)
                return log_oom();

        return 0;
}
#endif

/* Looks for root, /home and /srv partitions on the GPT-partitioned block
 * device open as `fd`.
 *
 * The partition table is probed with libblkid and the partition device
 * nodes are enumerated through udev. Partitions flagged GPT_FLAG_NO_AUTO
 * are ignored. For each role the matching partition with the lowest
 * partition number is chosen. A native root partition is preferred; a
 * secondary root partition is used only if no native one exists.
 *
 * On success:
 *   - *root_device / *root_device_rw describe the root partition, and
 *     *secondary tells whether it is the secondary root;
 *   - *home_device / *home_device_rw and *srv_device / *srv_device_rw
 *     are written only if such a partition was found.
 * The device strings are heap-allocated; ownership passes to the caller.
 *
 * Returns 0 on success and a negative errno-style value on failure,
 * notably -EINVAL if no GPT or no root partition is found. When built
 * without HAVE_BLKID, logs an error and returns -ENOTSUP. */
static int dissect_image(
                int fd,
                char **root_device, bool *root_device_rw,
                char **home_device, bool *home_device_rw,
                char **srv_device, bool *srv_device_rw,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *e = NULL;
        _cleanup_udev_device_unref_ struct udev_device *d = NULL;
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        bool home_rw = true, root_rw = true, secondary_root_rw = true, srv_rw = true;
        const char *pttype = NULL;
        blkid_partlist pl;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        b = blkid_new_probe();
        if (!b)
                return log_oom();

        /* errno is cleared before each libblkid call below, so that a
         * failure which leaves errno at 0 can be told apart from one
         * that set it. */
        errno = 0;
        r = blkid_probe_set_device(b, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(b, 1);
        blkid_probe_set_partitions_flags(b, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected */
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(b, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        pl = blkid_probe_get_partitions(b);
        if (!pl) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s: %m", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        d = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!d)
                return log_oom();

        /* Enumerate all udev devices that have the whole-disk device as
         * parent. */
        e = udev_enumerate_new(udev);
        if (!e)
                return log_oom();

        r = udev_enumerate_add_match_parent(e, d);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(e);
        if (r < 0)
                return log_error_errno(r, "Failed to scan for partition devices of %s: %m", arg_image);

        first = udev_enumerate_get_list_entry(e);
        udev_list_entry_foreach(item, first) {
                /* Initialized, so that the cleanup handler never sees an
                 * indeterminate pointer. */
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                unsigned long long flags;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number ... */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                /* ... the whole-disk device itself ... */
                if (st.st_rdev == qn)
                        continue;

                /* ... and everything we lack usable information on. */
                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                pp = blkid_partlist_devno_to_partition(pl, qn);
                if (!pp)
                        continue;

                flags = blkid_partition_get_flags(pp);
                if (flags & GPT_FLAG_NO_AUTO)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                /* Dispatch on the partition type. */
                if (sd_id128_equal(type_id, GPT_HOME))
                        r = dissect_pick_partition(&home, &home_nr, &home_rw, node, nr, flags);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        r = dissect_pick_partition(&srv, &srv_nr, &srv_rw, node, nr, flags);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        r = dissect_pick_partition(&root, &root_nr, &root_rw, node, nr, flags);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        r = dissect_pick_partition(&secondary_root, &secondary_root_nr, &secondary_root_rw, node, nr, flags);
#endif
                else
                        r = 0; /* a partition type we are not interested in */

                if (r < 0)
                        return r;
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Hand the results over to the caller; the local pointers are
         * reset so that the cleanup handlers do not free them. */
        if (root) {
                *root_device = root;
                root = NULL;

                *root_device_rw = root_rw;
                *secondary = false;
        } else if (secondary_root) {
                *root_device = secondary_root;
                secondary_root = NULL;

                *root_device_rw = secondary_root_rw;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;

                *home_device_rw = home_rw;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;

                *srv_device_rw = srv_rw;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2384
/* Mounts the block device `what` on `where`, or on `where` with
 * `directory` appended if `directory` is non-NULL.
 *
 * The file system type is detected with libblkid. The mount is read-only
 * if `rw` is false or arg_read_only is set, and always uses MS_NODEV.
 * Devices whose detected type is "crypto_LUKS" are refused.
 *
 * Returns 0 on success and a negative errno-style value on failure:
 * -EINVAL if the file system type cannot be determined, -ENOTSUP for
 * LUKS devices or when built without HAVE_BLKID. */
static int mount_device(const char *what, const char *where, const char *directory, bool rw) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (arg_read_only)
                rw = false;

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        /* errno is cleared before each libblkid call, so that a failure
         * which leaves errno at 0 can be told apart from one that set
         * it. */
        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                /* -2: ambivalent result, 1: nothing detected. A hard
                 * probing error (-1) is handled by the branch below. */
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        if (mount(what, p, fstype, MS_NODEV|(rw ? 0 : MS_RDONLY), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2450
/* Mounts the partitions discovered in a disk image below 'where'.
 *
 * Each of root_device, home_device and srv_device may be NULL, in which
 * case that mount is skipped. The root device is mounted on 'where'
 * itself, the others on "/home" and "/srv" below it. The *_rw flags are
 * passed through to mount_device(). Returns 0 on success, or the
 * negative error of the first mount that failed. */
static int mount_devices(
                const char *where,
                const char *root_device, bool root_device_rw,
                const char *home_device, bool home_device_rw,
                const char *srv_device, bool srv_device_rw) {
        int r;

        assert(where);

        /* Use the caller-supplied mount root rather than reaching for
         * the global arg_directory, so the parameter is actually honoured. */

        if (root_device) {
                r = mount_device(root_device, where, NULL, root_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount root directory: %m");
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home", home_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount home directory: %m");
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv", srv_device_rw);
                if (r < 0)
                        return log_error_errno(r, "Failed to mount server data directory: %m");
        }

        return 0;
}
2480
2481 static void loop_remove(int nr, int *image_fd) {
2482         _cleanup_close_ int control = -1;
2483         int r;
2484
2485         if (nr < 0)
2486                 return;
2487
2488         if (image_fd && *image_fd >= 0) {
2489                 r = ioctl(*image_fd, LOOP_CLR_FD);
2490                 if (r < 0)
2491                         log_warning("Failed to close loop image: %m");
2492                 *image_fd = safe_close(*image_fd);
2493         }
2494
2495         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2496         if (control < 0) {
2497                 log_warning("Failed to open /dev/loop-control: %m");
2498                 return;
2499         }
2500
2501         r = ioctl(control, LOOP_CTL_REMOVE, nr);
2502         if (r < 0)
2503                 log_warning("Failed to remove loop %d: %m", nr);
2504 }
2505
/* Forks off "getent <database> <key>" with its stdout connected to a pipe.
 *
 * The child runs with stdin/stderr redirected to /dev/null, all other
 * file descriptors closed and an empty environment; it tries
 * /usr/bin/getent first and /bin/getent second.
 *
 * On success the read end of the pipe is returned (owned by the caller)
 * and the child's PID is stored in *rpid so the caller can wait for it.
 * On failure a negative errno-style value is returned and no file
 * descriptors are left open. */
static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
        int pipe_fds[2];
        pid_t pid;

        assert(database);
        assert(key);
        assert(rpid);

        if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
                log_error("Failed to allocate pipe: %m");
                return -errno;
        }

        pid = fork();
        if (pid < 0) {
                /* Remember the fork() error before doing anything else,
                 * then release both pipe ends so they are not leaked. */
                int saved_errno = errno;

                log_error("Failed to fork getent child: %m");

                safe_close(pipe_fds[0]);
                safe_close(pipe_fds[1]);

                return -saved_errno;
        } else if (pid == 0) {
                int nullfd;
                char *empty_env = NULL;

                /* Child: wire the write end of the pipe to stdout */
                if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                /* Only close the originals if they are not one of the
                 * standard descriptors we are setting up here */
                if (pipe_fds[0] > 2)
                        safe_close(pipe_fds[0]);
                if (pipe_fds[1] > 2)
                        safe_close(pipe_fds[1]);

                nullfd = open("/dev/null", O_RDWR);
                if (nullfd < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDIN_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (dup3(nullfd, STDERR_FILENO, 0) < 0)
                        _exit(EXIT_FAILURE);

                if (nullfd > 2)
                        safe_close(nullfd);

                reset_all_signal_handlers();
                close_all_fds(NULL, 0);

                /* Second exec is only reached if the first one failed */
                execle("/usr/bin/getent", "getent", database, key, NULL, &empty_env);
                execle("/bin/getent", "getent", database, key, NULL, &empty_env);
                _exit(EXIT_FAILURE);
        }

        /* Parent: keep only the read end */
        pipe_fds[1] = safe_close(pipe_fds[1]);

        *rpid = pid;

        return pipe_fds[0];
}
2562
2563 static int change_uid_gid(char **_home) {
2564         char line[LINE_MAX], *x, *u, *g, *h;
2565         const char *word, *state;
2566         _cleanup_free_ uid_t *uids = NULL;
2567         _cleanup_free_ char *home = NULL;
2568         _cleanup_fclose_ FILE *f = NULL;
2569         _cleanup_close_ int fd = -1;
2570         unsigned n_uids = 0;
2571         size_t sz = 0, l;
2572         uid_t uid;
2573         gid_t gid;
2574         pid_t pid;
2575         int r;
2576
2577         assert(_home);
2578
2579         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2580                 /* Reset everything fully to 0, just in case */
2581
2582                 if (setgroups(0, NULL) < 0) {
2583                         log_error("setgroups() failed: %m");
2584                         return -errno;
2585                 }
2586
2587                 if (setresgid(0, 0, 0) < 0) {
2588                         log_error("setregid() failed: %m");
2589                         return -errno;
2590                 }
2591
2592                 if (setresuid(0, 0, 0) < 0) {
2593                         log_error("setreuid() failed: %m");
2594                         return -errno;
2595                 }
2596
2597                 *_home = NULL;
2598                 return 0;
2599         }
2600
2601         /* First, get user credentials */
2602         fd = spawn_getent("passwd", arg_user, &pid);
2603         if (fd < 0)
2604                 return fd;
2605
2606         f = fdopen(fd, "r");
2607         if (!f)
2608                 return log_oom();
2609         fd = -1;
2610
2611         if (!fgets(line, sizeof(line), f)) {
2612
2613                 if (!ferror(f)) {
2614                         log_error("Failed to resolve user %s.", arg_user);
2615                         return -ESRCH;
2616                 }
2617
2618                 log_error("Failed to read from getent: %m");
2619                 return -errno;
2620         }
2621
2622         truncate_nl(line);
2623
2624         wait_for_terminate_and_warn("getent passwd", pid);
2625
2626         x = strchr(line, ':');
2627         if (!x) {
2628                 log_error("/etc/passwd entry has invalid user field.");
2629                 return -EIO;
2630         }
2631
2632         u = strchr(x+1, ':');
2633         if (!u) {
2634                 log_error("/etc/passwd entry has invalid password field.");
2635                 return -EIO;
2636         }
2637
2638         u++;
2639         g = strchr(u, ':');
2640         if (!g) {
2641                 log_error("/etc/passwd entry has invalid UID field.");
2642                 return -EIO;
2643         }
2644
2645         *g = 0;
2646         g++;
2647         x = strchr(g, ':');
2648         if (!x) {
2649                 log_error("/etc/passwd entry has invalid GID field.");
2650                 return -EIO;
2651         }
2652
2653         *x = 0;
2654         h = strchr(x+1, ':');
2655         if (!h) {
2656                 log_error("/etc/passwd entry has invalid GECOS field.");
2657                 return -EIO;
2658         }
2659
2660         h++;
2661         x = strchr(h, ':');
2662         if (!x) {
2663                 log_error("/etc/passwd entry has invalid home directory field.");
2664                 return -EIO;
2665         }
2666
2667         *x = 0;
2668
2669         r = parse_uid(u, &uid);
2670         if (r < 0) {
2671                 log_error("Failed to parse UID of user.");
2672                 return -EIO;
2673         }
2674
2675         r = parse_gid(g, &gid);
2676         if (r < 0) {
2677                 log_error("Failed to parse GID of user.");
2678                 return -EIO;
2679         }
2680
2681         home = strdup(h);
2682         if (!home)
2683                 return log_oom();
2684
2685         /* Second, get group memberships */
2686         fd = spawn_getent("initgroups", arg_user, &pid);
2687         if (fd < 0)
2688                 return fd;
2689
2690         fclose(f);
2691         f = fdopen(fd, "r");
2692         if (!f)
2693                 return log_oom();
2694         fd = -1;
2695
2696         if (!fgets(line, sizeof(line), f)) {
2697                 if (!ferror(f)) {
2698                         log_error("Failed to resolve user %s.", arg_user);
2699                         return -ESRCH;
2700                 }
2701
2702                 log_error("Failed to read from getent: %m");
2703                 return -errno;
2704         }
2705
2706         truncate_nl(line);
2707
2708         wait_for_terminate_and_warn("getent initgroups", pid);
2709
2710         /* Skip over the username and subsequent separator whitespace */
2711         x = line;
2712         x += strcspn(x, WHITESPACE);
2713         x += strspn(x, WHITESPACE);
2714
2715         FOREACH_WORD(word, l, x, state) {
2716                 char c[l+1];
2717
2718                 memcpy(c, word, l);
2719                 c[l] = 0;
2720
2721                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2722                         return log_oom();
2723
2724                 r = parse_uid(c, &uids[n_uids++]);
2725                 if (r < 0) {
2726                         log_error("Failed to parse group data from getent.");
2727                         return -EIO;
2728                 }
2729         }
2730
2731         r = mkdir_parents(home, 0775);
2732         if (r < 0)
2733                 return log_error_errno(r, "Failed to make home root directory: %m");
2734
2735         r = mkdir_safe(home, 0755, uid, gid);
2736         if (r < 0 && r != -EEXIST)
2737                 return log_error_errno(r, "Failed to make home directory: %m");
2738
2739         fchown(STDIN_FILENO, uid, gid);
2740         fchown(STDOUT_FILENO, uid, gid);
2741         fchown(STDERR_FILENO, uid, gid);
2742
2743         if (setgroups(n_uids, uids) < 0) {
2744                 log_error("Failed to set auxiliary groups: %m");
2745                 return -errno;
2746         }
2747
2748         if (setresgid(gid, gid, gid) < 0) {
2749                 log_error("setregid() failed: %m");
2750                 return -errno;
2751         }
2752
2753         if (setresuid(uid, uid, uid) < 0) {
2754                 log_error("setreuid() failed: %m");
2755                 return -errno;
2756         }
2757
2758         if (_home) {
2759                 *_home = home;
2760                 home = NULL;
2761         }
2762
2763         return 0;
2764 }
2765
2766 /*
2767  * Return values:
2768  * < 0 : wait_for_terminate() failed to get the state of the
2769  *       container, the container was terminated by a signal, or
2770  *       failed for an unknown reason.  No change is made to the
2771  *       container argument.
2772  * > 0 : The program executed in the container terminated with an
2773  *       error.  The exit code of the program executed in the
2774  *       container is returned.  The container argument has been set
2775  *       to CONTAINER_TERMINATED.
2776  *   0 : The container is being rebooted, has been shut down or exited
2777  *       successfully.  The container argument has been set to either
2778  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2779  *
2780  * That is, success is indicated by a return value of zero, and an
2781  * error is indicated by a non-zero value.
2782  */
2783 static int wait_for_container(pid_t pid, ContainerStatus *container) {
2784         siginfo_t status;
2785         int r;
2786
2787         r = wait_for_terminate(pid, &status);
2788         if (r < 0)
2789                 return log_warning_errno(r, "Failed to wait for container: %m");
2790
2791         switch (status.si_code) {
2792
2793         case CLD_EXITED:
2794                 if (status.si_status == 0) {
2795                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s exited successfully.", arg_machine);
2796
2797                 } else
2798                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s failed with error code %i.", arg_machine, status.si_status);
2799
2800                 *container = CONTAINER_TERMINATED;
2801                 return status.si_status;
2802
2803         case CLD_KILLED:
2804                 if (status.si_status == SIGINT) {
2805
2806                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s has been shut down.", arg_machine);
2807                         *container = CONTAINER_TERMINATED;
2808                         return 0;
2809
2810                 } else if (status.si_status == SIGHUP) {
2811
2812                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO, "Container %s is being rebooted.", arg_machine);
2813                         *container = CONTAINER_REBOOTED;
2814                         return 0;
2815                 }
2816
2817                 /* CLD_KILLED fallthrough */
2818
2819         case CLD_DUMPED:
2820                 log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2821                 return -EIO;
2822
2823         default:
2824                 log_error("Container %s failed due to unknown reason.", arg_machine);
2825                 return -EIO;
2826         }
2827
2828         return r;
2829 }
2830
/* Signal handler that intentionally does nothing. */
static void nop_handler(int sig) {
        (void) sig;
}
2832
2833 static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
2834         pid_t pid;
2835
2836         pid = PTR_TO_UINT32(userdata);
2837         if (pid > 0) {
2838                 if (kill(pid, SIGRTMIN+3) >= 0) {
2839                         log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
2840                         sd_event_source_set_userdata(s, NULL);
2841                         return 0;
2842                 }
2843         }
2844
2845         sd_event_exit(sd_event_source_get_event(s), 0);
2846         return 0;
2847 }
2848
2849 int main(int argc, char *argv[]) {
2850
2851         _cleanup_free_ char *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2852         bool root_device_rw = true, home_device_rw = true, srv_device_rw = true;
2853         _cleanup_close_ int master = -1, image_fd = -1;
2854         _cleanup_close_pair_ int kmsg_socket_pair[2] = { -1, -1 };
2855         _cleanup_fdset_free_ FDSet *fds = NULL;
2856         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2857         const char *console = NULL;
2858         char veth_name[IFNAMSIZ];
2859         bool secondary = false;
2860         sigset_t mask, mask_chld;
2861         pid_t pid = 0;
2862
2863         log_parse_environment();
2864         log_open();
2865
2866         k = parse_argv(argc, argv);
2867         if (k < 0)
2868                 goto finish;
2869         else if (k == 0) {
2870                 r = EXIT_SUCCESS;
2871                 goto finish;
2872         }
2873
2874         if (!arg_image) {
2875                 if (arg_directory) {
2876                         char *p;
2877
2878                         p = path_make_absolute_cwd(arg_directory);
2879                         free(arg_directory);
2880                         arg_directory = p;
2881                 } else
2882                         arg_directory = get_current_dir_name();
2883
2884                 if (!arg_directory) {
2885                         log_error("Failed to determine path, please use -D.");
2886                         goto finish;
2887                 }
2888                 path_kill_slashes(arg_directory);
2889         }
2890
2891         if (!arg_machine) {
2892                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2893                 if (!arg_machine) {
2894                         log_oom();
2895                         goto finish;
2896                 }
2897
2898                 hostname_cleanup(arg_machine, false);
2899                 if (isempty(arg_machine)) {
2900                         log_error("Failed to determine machine name automatically, please use -M.");
2901                         goto finish;
2902                 }
2903         }
2904
2905         if (geteuid() != 0) {
2906                 log_error("Need to be root.");
2907                 goto finish;
2908         }
2909
2910         if (sd_booted() <= 0) {
2911                 log_error("Not running on a systemd system.");
2912                 goto finish;
2913         }
2914
2915         log_close();
2916         n_fd_passed = sd_listen_fds(false);
2917         if (n_fd_passed > 0) {
2918                 k = fdset_new_listen_fds(&fds, false);
2919                 if (k < 0) {
2920                         log_error_errno(k, "Failed to collect file descriptors: %m");
2921                         goto finish;
2922                 }
2923         }
2924         fdset_close_others(fds);
2925         log_open();
2926
2927         if (arg_directory) {
2928                 if (path_equal(arg_directory, "/")) {
2929                         log_error("Spawning container on root directory not supported.");
2930                         goto finish;
2931                 }
2932
2933                 if (arg_boot) {
2934                         if (path_is_os_tree(arg_directory) <= 0) {
2935                                 log_error("Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", arg_directory);
2936                                 goto finish;
2937                         }
2938                 } else {
2939                         const char *p;
2940
2941                         p = strappenda(arg_directory,
2942                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2943                         if (access(p, F_OK) < 0) {
2944                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2945                                 goto finish;
2946
2947                         }
2948                 }
2949         } else {
2950                 char template[] = "/tmp/nspawn-root-XXXXXX";
2951
2952                 if (!mkdtemp(template)) {
2953                         log_error("Failed to create temporary directory: %m");
2954                         r = -errno;
2955                         goto finish;
2956                 }
2957
2958                 arg_directory = strdup(template);
2959                 if (!arg_directory) {
2960                         r = log_oom();
2961                         goto finish;
2962                 }
2963
2964                 image_fd = setup_image(&device_path, &loop_nr);
2965                 if (image_fd < 0) {
2966                         r = image_fd;
2967                         goto finish;
2968                 }
2969
2970                 r = dissect_image(image_fd,
2971                                   &root_device, &root_device_rw,
2972                                   &home_device, &home_device_rw,
2973                                   &srv_device, &srv_device_rw,
2974                                   &secondary);
2975                 if (r < 0)
2976                         goto finish;
2977         }
2978
2979         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2980         if (master < 0) {
2981                 log_error("Failed to acquire pseudo tty: %m");
2982                 goto finish;
2983         }
2984
2985         console = ptsname(master);
2986         if (!console) {
2987                 log_error("Failed to determine tty name: %m");
2988                 goto finish;
2989         }
2990
2991         if (!arg_quiet)
2992                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
2993                          arg_machine, arg_image ? arg_image : arg_directory);
2994
2995         if (unlockpt(master) < 0) {
2996                 log_error("Failed to unlock tty: %m");
2997                 goto finish;
2998         }
2999
3000         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
3001                 log_error("Failed to create kmsg socket pair: %m");
3002                 goto finish;
3003         }
3004
3005         sd_notify(false,
3006                   "READY=1\n"
3007                   "STATUS=Container running.");
3008
3009         assert_se(sigemptyset(&mask) == 0);
3010         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
3011         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
3012
3013         assert_se(sigemptyset(&mask_chld) == 0);
3014         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
3015
3016         for (;;) {
3017                 ContainerStatus container_status;
3018                 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
3019                 struct sigaction sa = {
3020                         .sa_handler = nop_handler,
3021                         .sa_flags = SA_NOCLDSTOP,
3022                 };
3023
3024                 r = barrier_create(&barrier);
3025                 if (r < 0) {
3026                         log_error_errno(r, "Cannot initialize IPC barrier: %m");
3027                         goto finish;
3028                 }
3029
3030                 /* Child can be killed before execv(), so handle SIGCHLD
3031                  * in order to interrupt parent's blocking calls and
3032                  * give it a chance to call wait() and terminate. */
3033                 r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
3034                 if (r < 0) {
3035                         log_error("Failed to change the signal mask: %m");
3036                         goto finish;
3037                 }
3038
3039                 r = sigaction(SIGCHLD, &sa, NULL);
3040                 if (r < 0) {
3041                         log_error("Failed to install SIGCHLD handler: %m");
3042                         goto finish;
3043                 }
3044
3045                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWNS|
3046                                           (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
3047                                           (arg_private_network ? CLONE_NEWNET : 0), NULL);
3048                 if (pid < 0) {
3049                         if (errno == EINVAL)
3050                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
3051                         else
3052                                 log_error("clone() failed: %m");
3053
3054                         r = pid;
3055                         goto finish;
3056                 }
3057
3058                 if (pid == 0) {
3059                         /* child */
3060                         _cleanup_free_ char *home = NULL;
3061                         unsigned n_env = 2;
3062                         const char *envp[] = {
3063                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
3064                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
3065                                 NULL, /* TERM */
3066                                 NULL, /* HOME */
3067                                 NULL, /* USER */
3068                                 NULL, /* LOGNAME */
3069                                 NULL, /* container_uuid */
3070                                 NULL, /* LISTEN_FDS */
3071                                 NULL, /* LISTEN_PID */
3072                                 NULL
3073                         };
3074                         char **env_use;
3075
3076                         barrier_set_role(&barrier, BARRIER_CHILD);
3077
3078                         envp[n_env] = strv_find_prefix(environ, "TERM=");
3079                         if (envp[n_env])
3080                                 n_env ++;
3081
3082                         master = safe_close(master);
3083
3084                         close_nointr(STDIN_FILENO);
3085                         close_nointr(STDOUT_FILENO);
3086                         close_nointr(STDERR_FILENO);
3087
3088                         kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
3089
3090                         reset_all_signal_handlers();
3091                         reset_signal_mask();
3092
3093                         k = open_terminal(console, O_RDWR);
3094                         if (k != STDIN_FILENO) {
3095                                 if (k >= 0) {
3096                                         safe_close(k);
3097                                         k = -EINVAL;
3098                                 }
3099
3100                                 log_error_errno(k, "Failed to open console: %m");
3101                                 _exit(EXIT_FAILURE);
3102                         }
3103
3104                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
3105                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
3106                                 log_error("Failed to duplicate console: %m");
3107                                 _exit(EXIT_FAILURE);
3108                         }
3109
3110                         if (setsid() < 0) {
3111                                 log_error("setsid() failed: %m");
3112                                 _exit(EXIT_FAILURE);
3113                         }
3114
3115                         if (reset_audit_loginuid() < 0)
3116                                 _exit(EXIT_FAILURE);
3117
3118                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
3119                                 log_error("PR_SET_PDEATHSIG failed: %m");
3120                                 _exit(EXIT_FAILURE);
3121                         }
3122
3123                         /* Mark everything as slave, so that we still