chiark / gitweb /
1051b922c93fb8e691d08e0fa0326c9e18cecc5e
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47 #include <linux/loop.h>
48
49 #ifdef HAVE_SELINUX
50 #include <selinux/selinux.h>
51 #endif
52
53 #ifdef HAVE_SECCOMP
54 #include <seccomp.h>
55 #endif
56
57 #ifdef HAVE_BLKID
58 #include <blkid/blkid.h>
59 #endif
60
61 #include "sd-daemon.h"
62 #include "sd-bus.h"
63 #include "sd-id128.h"
64 #include "sd-rtnl.h"
65 #include "log.h"
66 #include "util.h"
67 #include "mkdir.h"
68 #include "macro.h"
69 #include "audit.h"
70 #include "missing.h"
71 #include "cgroup-util.h"
72 #include "strv.h"
73 #include "path-util.h"
74 #include "loopback-setup.h"
75 #include "dev-setup.h"
76 #include "fdset.h"
77 #include "build.h"
78 #include "fileio.h"
79 #include "bus-util.h"
80 #include "bus-error.h"
81 #include "ptyfwd.h"
82 #include "bus-kernel.h"
83 #include "env-util.h"
84 #include "def.h"
85 #include "rtnl-util.h"
86 #include "udev-util.h"
87 #include "blkid-util.h"
88 #include "gpt.h"
89
90 #ifdef HAVE_SECCOMP
91 #include "seccomp-util.h"
92 #endif
93
/* Journal linking modes. parse_argv() maps the --link-journal= values
 * "no", "auto", "host" and "guest" onto these, and -j onto LINK_GUEST. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
100
101 static char *arg_directory = NULL;
102 static char *arg_user = NULL;
103 static sd_id128_t arg_uuid = {};
104 static char *arg_machine = NULL;
105 static const char *arg_selinux_context = NULL;
106 static const char *arg_selinux_apifs_context = NULL;
107 static const char *arg_slice = NULL;
108 static bool arg_private_network = false;
109 static bool arg_read_only = false;
110 static bool arg_boot = false;
111 static LinkJournal arg_link_journal = LINK_AUTO;
112 static uint64_t arg_retain =
113         (1ULL << CAP_CHOWN) |
114         (1ULL << CAP_DAC_OVERRIDE) |
115         (1ULL << CAP_DAC_READ_SEARCH) |
116         (1ULL << CAP_FOWNER) |
117         (1ULL << CAP_FSETID) |
118         (1ULL << CAP_IPC_OWNER) |
119         (1ULL << CAP_KILL) |
120         (1ULL << CAP_LEASE) |
121         (1ULL << CAP_LINUX_IMMUTABLE) |
122         (1ULL << CAP_NET_BIND_SERVICE) |
123         (1ULL << CAP_NET_BROADCAST) |
124         (1ULL << CAP_NET_RAW) |
125         (1ULL << CAP_SETGID) |
126         (1ULL << CAP_SETFCAP) |
127         (1ULL << CAP_SETPCAP) |
128         (1ULL << CAP_SETUID) |
129         (1ULL << CAP_SYS_ADMIN) |
130         (1ULL << CAP_SYS_CHROOT) |
131         (1ULL << CAP_SYS_NICE) |
132         (1ULL << CAP_SYS_PTRACE) |
133         (1ULL << CAP_SYS_TTY_CONFIG) |
134         (1ULL << CAP_SYS_RESOURCE) |
135         (1ULL << CAP_SYS_BOOT) |
136         (1ULL << CAP_AUDIT_WRITE) |
137         (1ULL << CAP_AUDIT_CONTROL) |
138         (1ULL << CAP_MKNOD);
139 static char **arg_bind = NULL;
140 static char **arg_bind_ro = NULL;
141 static char **arg_setenv = NULL;
142 static bool arg_quiet = false;
143 static bool arg_share_system = false;
144 static bool arg_register = true;
145 static bool arg_keep_unit = false;
146 static char **arg_network_interfaces = NULL;
147 static char **arg_network_macvlan = NULL;
148 static bool arg_network_veth = false;
149 static const char *arg_network_bridge = NULL;
150 static unsigned long arg_personality = 0xffffffffLU;
151 static const char *arg_image = NULL;
152
153 static int help(void) {
154
155         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
156                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
157                "  -h --help                 Show this help\n"
158                "     --version              Print version string\n"
159                "  -q --quiet                Do not show status information\n"
160                "  -D --directory=PATH       Root directory for the container\n"
161                "  -i --image=PATH           File system device or image for the container\n"
162                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
163                "  -u --user=USER            Run the command under specified user or uid\n"
164                "  -M --machine=NAME         Set the machine name for the container\n"
165                "     --uuid=UUID            Set a specific machine UUID for the container\n"
166                "  -S --slice=SLICE          Place the container in the specified slice\n"
167                "     --private-network      Disable network in container\n"
168                "     --network-interface=INTERFACE\n"
169                "                            Assign an existing network interface to the\n"
170                "                            container\n"
171                "     --network-macvlan=INTERFACE\n"
172                "                            Create a macvlan network interface based on an\n"
173                "                            existing network interface to the container\n"
174                "     --network-veth         Add a virtual ethernet connection between host\n"
175                "                            and container\n"
176                "     --network-bridge=INTERFACE\n"
177                "                            Add a virtual ethernet connection between host\n"
178                "                            and container and add it to an existing bridge on\n"
179                "                            the host\n"
180                "  -Z --selinux-context=SECLABEL\n"
181                "                            Set the SELinux security context to be used by\n"
182                "                            processes in the container\n"
183                "  -L --selinux-apifs-context=SECLABEL\n"
184                "                            Set the SELinux security context to be used by\n"
185                "                            API/tmpfs file systems in the container\n"
186                "     --capability=CAP       In addition to the default, retain specified\n"
187                "                            capability\n"
188                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
189                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
190                "  -j                        Equivalent to --link-journal=host\n"
191                "     --read-only            Mount the root directory read-only\n"
192                "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
193                "                            the container\n"
194                "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
195                "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
196                "     --share-system         Share system namespaces with host\n"
197                "     --register=BOOLEAN     Register container as machine\n"
198                "     --keep-unit            Do not register a scope for the machine, reuse\n"
199                "                            the service unit nspawn is running in\n",
200                program_invocation_short_name);
201
202         return 0;
203 }
204
205 static int parse_argv(int argc, char *argv[]) {
206
207         enum {
208                 ARG_VERSION = 0x100,
209                 ARG_PRIVATE_NETWORK,
210                 ARG_UUID,
211                 ARG_READ_ONLY,
212                 ARG_CAPABILITY,
213                 ARG_DROP_CAPABILITY,
214                 ARG_LINK_JOURNAL,
215                 ARG_BIND,
216                 ARG_BIND_RO,
217                 ARG_SETENV,
218                 ARG_SHARE_SYSTEM,
219                 ARG_REGISTER,
220                 ARG_KEEP_UNIT,
221                 ARG_NETWORK_INTERFACE,
222                 ARG_NETWORK_MACVLAN,
223                 ARG_NETWORK_VETH,
224                 ARG_NETWORK_BRIDGE,
225                 ARG_PERSONALITY,
226         };
227
228         static const struct option options[] = {
229                 { "help",                  no_argument,       NULL, 'h'                   },
230                 { "version",               no_argument,       NULL, ARG_VERSION           },
231                 { "directory",             required_argument, NULL, 'D'                   },
232                 { "user",                  required_argument, NULL, 'u'                   },
233                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
234                 { "boot",                  no_argument,       NULL, 'b'                   },
235                 { "uuid",                  required_argument, NULL, ARG_UUID              },
236                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
237                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
238                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
239                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
240                 { "bind",                  required_argument, NULL, ARG_BIND              },
241                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
242                 { "machine",               required_argument, NULL, 'M'                   },
243                 { "slice",                 required_argument, NULL, 'S'                   },
244                 { "setenv",                required_argument, NULL, ARG_SETENV            },
245                 { "selinux-context",       required_argument, NULL, 'Z'                   },
246                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
247                 { "quiet",                 no_argument,       NULL, 'q'                   },
248                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
249                 { "register",              required_argument, NULL, ARG_REGISTER          },
250                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
251                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
252                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
253                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
254                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
255                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
256                 { "image",                 required_argument, NULL, 'i'                   },
257                 {}
258         };
259
260         int c, r;
261         uint64_t plus = 0, minus = 0;
262
263         assert(argc >= 0);
264         assert(argv);
265
266         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:qi:", options, NULL)) >= 0) {
267
268                 switch (c) {
269
270                 case 'h':
271                         return help();
272
273                 case ARG_VERSION:
274                         puts(PACKAGE_STRING);
275                         puts(SYSTEMD_FEATURES);
276                         return 0;
277
278                 case 'D':
279                         free(arg_directory);
280                         arg_directory = canonicalize_file_name(optarg);
281                         if (!arg_directory) {
282                                 log_error("Invalid root directory: %m");
283                                 return -ENOMEM;
284                         }
285
286                         break;
287
288                 case 'i':
289                         arg_image = optarg;
290                         break;
291
292                 case 'u':
293                         free(arg_user);
294                         arg_user = strdup(optarg);
295                         if (!arg_user)
296                                 return log_oom();
297
298                         break;
299
300                 case ARG_NETWORK_BRIDGE:
301                         arg_network_bridge = optarg;
302
303                         /* fall through */
304
305                 case ARG_NETWORK_VETH:
306                         arg_network_veth = true;
307                         arg_private_network = true;
308                         break;
309
310                 case ARG_NETWORK_INTERFACE:
311                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
312                                 return log_oom();
313
314                         arg_private_network = true;
315                         break;
316
317                 case ARG_NETWORK_MACVLAN:
318                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
319                                 return log_oom();
320
321                         /* fall through */
322
323                 case ARG_PRIVATE_NETWORK:
324                         arg_private_network = true;
325                         break;
326
327                 case 'b':
328                         arg_boot = true;
329                         break;
330
331                 case ARG_UUID:
332                         r = sd_id128_from_string(optarg, &arg_uuid);
333                         if (r < 0) {
334                                 log_error("Invalid UUID: %s", optarg);
335                                 return r;
336                         }
337                         break;
338
339                 case 'S':
340                         arg_slice = optarg;
341                         break;
342
343                 case 'M':
344                         if (isempty(optarg)) {
345                                 free(arg_machine);
346                                 arg_machine = NULL;
347                         } else {
348
349                                 if (!hostname_is_valid(optarg)) {
350                                         log_error("Invalid machine name: %s", optarg);
351                                         return -EINVAL;
352                                 }
353
354                                 free(arg_machine);
355                                 arg_machine = strdup(optarg);
356                                 if (!arg_machine)
357                                         return log_oom();
358
359                                 break;
360                         }
361
362                 case 'Z':
363                         arg_selinux_context = optarg;
364                         break;
365
366                 case 'L':
367                         arg_selinux_apifs_context = optarg;
368                         break;
369
370                 case ARG_READ_ONLY:
371                         arg_read_only = true;
372                         break;
373
374                 case ARG_CAPABILITY:
375                 case ARG_DROP_CAPABILITY: {
376                         char *state, *word;
377                         size_t length;
378
379                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
380                                 _cleanup_free_ char *t;
381                                 cap_value_t cap;
382
383                                 t = strndup(word, length);
384                                 if (!t)
385                                         return log_oom();
386
387                                 if (streq(t, "all")) {
388                                         if (c == ARG_CAPABILITY)
389                                                 plus = (uint64_t) -1;
390                                         else
391                                                 minus = (uint64_t) -1;
392                                 } else {
393                                         if (cap_from_name(t, &cap) < 0) {
394                                                 log_error("Failed to parse capability %s.", t);
395                                                 return -EINVAL;
396                                         }
397
398                                         if (c == ARG_CAPABILITY)
399                                                 plus |= 1ULL << (uint64_t) cap;
400                                         else
401                                                 minus |= 1ULL << (uint64_t) cap;
402                                 }
403                         }
404
405                         break;
406                 }
407
408                 case 'j':
409                         arg_link_journal = LINK_GUEST;
410                         break;
411
412                 case ARG_LINK_JOURNAL:
413                         if (streq(optarg, "auto"))
414                                 arg_link_journal = LINK_AUTO;
415                         else if (streq(optarg, "no"))
416                                 arg_link_journal = LINK_NO;
417                         else if (streq(optarg, "guest"))
418                                 arg_link_journal = LINK_GUEST;
419                         else if (streq(optarg, "host"))
420                                 arg_link_journal = LINK_HOST;
421                         else {
422                                 log_error("Failed to parse link journal mode %s", optarg);
423                                 return -EINVAL;
424                         }
425
426                         break;
427
428                 case ARG_BIND:
429                 case ARG_BIND_RO: {
430                         _cleanup_free_ char *a = NULL, *b = NULL;
431                         char *e;
432                         char ***x;
433
434                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
435
436                         e = strchr(optarg, ':');
437                         if (e) {
438                                 a = strndup(optarg, e - optarg);
439                                 b = strdup(e + 1);
440                         } else {
441                                 a = strdup(optarg);
442                                 b = strdup(optarg);
443                         }
444
445                         if (!a || !b)
446                                 return log_oom();
447
448                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
449                                 log_error("Invalid bind mount specification: %s", optarg);
450                                 return -EINVAL;
451                         }
452
453                         r = strv_extend(x, a);
454                         if (r < 0)
455                                 return log_oom();
456
457                         r = strv_extend(x, b);
458                         if (r < 0)
459                                 return log_oom();
460
461                         break;
462                 }
463
464                 case ARG_SETENV: {
465                         char **n;
466
467                         if (!env_assignment_is_valid(optarg)) {
468                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
469                                 return -EINVAL;
470                         }
471
472                         n = strv_env_set(arg_setenv, optarg);
473                         if (!n)
474                                 return log_oom();
475
476                         strv_free(arg_setenv);
477                         arg_setenv = n;
478                         break;
479                 }
480
481                 case 'q':
482                         arg_quiet = true;
483                         break;
484
485                 case ARG_SHARE_SYSTEM:
486                         arg_share_system = true;
487                         break;
488
489                 case ARG_REGISTER:
490                         r = parse_boolean(optarg);
491                         if (r < 0) {
492                                 log_error("Failed to parse --register= argument: %s", optarg);
493                                 return r;
494                         }
495
496                         arg_register = r;
497                         break;
498
499                 case ARG_KEEP_UNIT:
500                         arg_keep_unit = true;
501                         break;
502
503                 case ARG_PERSONALITY:
504
505                         arg_personality = personality_from_string(optarg);
506                         if (arg_personality == 0xffffffffLU) {
507                                 log_error("Unknown or unsupported personality '%s'.", optarg);
508                                 return -EINVAL;
509                         }
510
511                         break;
512
513                 case '?':
514                         return -EINVAL;
515
516                 default:
517                         assert_not_reached("Unhandled option");
518                 }
519         }
520
521         if (arg_share_system)
522                 arg_register = false;
523
524         if (arg_boot && arg_share_system) {
525                 log_error("--boot and --share-system may not be combined.");
526                 return -EINVAL;
527         }
528
529         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
530                 log_error("--keep-unit may not be used when invoked from a user session.");
531                 return -EINVAL;
532         }
533
534         if (arg_directory && arg_image) {
535                 log_error("--directory= and --image= may not be combined.");
536                 return -EINVAL;
537         }
538
539         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
540
541         return 1;
542 }
543
544 static int mount_all(const char *dest) {
545
546         typedef struct MountPoint {
547                 const char *what;
548                 const char *where;
549                 const char *type;
550                 const char *options;
551                 unsigned long flags;
552                 bool fatal;
553         } MountPoint;
554
555         static const MountPoint mount_table[] = {
556                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
557                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
558                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
559                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
560                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
561                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
562                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
563                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
564 #ifdef HAVE_SELINUX
565                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
566                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
567 #endif
568         };
569
570         unsigned k;
571         int r = 0;
572
573         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
574                 _cleanup_free_ char *where = NULL;
575 #ifdef HAVE_SELINUX
576                 _cleanup_free_ char *options = NULL;
577 #endif
578                 const char *o;
579                 int t;
580
581                 where = strjoin(dest, "/", mount_table[k].where, NULL);
582                 if (!where)
583                         return log_oom();
584
585                 t = path_is_mount_point(where, true);
586                 if (t < 0) {
587                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
588
589                         if (r == 0)
590                                 r = t;
591
592                         continue;
593                 }
594
595                 /* Skip this entry if it is not a remount. */
596                 if (mount_table[k].what && t > 0)
597                         continue;
598
599                 mkdir_p(where, 0755);
600
601 #ifdef HAVE_SELINUX
602                 if (arg_selinux_apifs_context &&
603                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
604                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
605                         if (!options)
606                                 return log_oom();
607
608                         o = options;
609                 } else
610 #endif
611                         o = mount_table[k].options;
612
613
614                 if (mount(mount_table[k].what,
615                           where,
616                           mount_table[k].type,
617                           mount_table[k].flags,
618                           o) < 0 &&
619                     mount_table[k].fatal) {
620
621                         log_error("mount(%s) failed: %m", where);
622
623                         if (r == 0)
624                                 r = -errno;
625                 }
626         }
627
628         return r;
629 }
630
631 static int mount_binds(const char *dest, char **l, unsigned long flags) {
632         char **x, **y;
633
634         STRV_FOREACH_PAIR(x, y, l) {
635                 char *where;
636                 struct stat source_st, dest_st;
637                 int r;
638
639                 if (stat(*x, &source_st) < 0) {
640                         log_error("Failed to stat %s: %m", *x);
641                         return -errno;
642                 }
643
644                 where = strappenda(dest, *y);
645                 r = stat(where, &dest_st);
646                 if (r == 0) {
647                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
648                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
649                                                 *x, where);
650                                 return -EINVAL;
651                         }
652                 } else if (errno == ENOENT) {
653                         r = mkdir_parents_label(where, 0755);
654                         if (r < 0) {
655                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
656                                 return r;
657                         }
658                 } else {
659                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
660                         return -errno;
661                 }
662                 /* Create the mount point, but be conservative -- refuse to create block
663                 * and char devices. */
664                 if (S_ISDIR(source_st.st_mode))
665                         mkdir_label(where, 0755);
666                 else if (S_ISFIFO(source_st.st_mode))
667                         mkfifo(where, 0644);
668                 else if (S_ISSOCK(source_st.st_mode))
669                         mknod(where, 0644 | S_IFSOCK, 0);
670                 else if (S_ISREG(source_st.st_mode))
671                         touch(where);
672                 else {
673                         log_error("Refusing to create mountpoint for file: %s", *x);
674                         return -ENOTSUP;
675                 }
676
677                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
678                         log_error("mount(%s) failed: %m", where);
679                         return -errno;
680                 }
681
682                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
683                         log_error("mount(%s) failed: %m", where);
684                         return -errno;
685                 }
686         }
687
688         return 0;
689 }
690
691 static int setup_timezone(const char *dest) {
692         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
693         char *z, *y;
694         int r;
695
696         assert(dest);
697
698         /* Fix the timezone, if possible */
699         r = readlink_malloc("/etc/localtime", &p);
700         if (r < 0) {
701                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
702                 return 0;
703         }
704
705         z = path_startswith(p, "../usr/share/zoneinfo/");
706         if (!z)
707                 z = path_startswith(p, "/usr/share/zoneinfo/");
708         if (!z) {
709                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
710                 return 0;
711         }
712
713         where = strappend(dest, "/etc/localtime");
714         if (!where)
715                 return log_oom();
716
717         r = readlink_malloc(where, &q);
718         if (r >= 0) {
719                 y = path_startswith(q, "../usr/share/zoneinfo/");
720                 if (!y)
721                         y = path_startswith(q, "/usr/share/zoneinfo/");
722
723
724                 /* Already pointing to the right place? Then do nothing .. */
725                 if (y && streq(y, z))
726                         return 0;
727         }
728
729         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
730         if (!check)
731                 return log_oom();
732
733         if (access(check, F_OK) < 0) {
734                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
735                 return 0;
736         }
737
738         what = strappend("../usr/share/zoneinfo/", z);
739         if (!what)
740                 return log_oom();
741
742         unlink(where);
743         if (symlink(what, where) < 0) {
744                 log_error("Failed to correct timezone of container: %m");
745                 return 0;
746         }
747
748         return 0;
749 }
750
751 static int setup_resolv_conf(const char *dest) {
752         char _cleanup_free_ *where = NULL;
753
754         assert(dest);
755
756         if (arg_private_network)
757                 return 0;
758
759         /* Fix resolv.conf, if possible */
760         where = strappend(dest, "/etc/resolv.conf");
761         if (!where)
762                 return log_oom();
763
764         /* We don't really care for the results of this really. If it
765          * fails, it fails, but meh... */
766         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
767
768         return 0;
769 }
770
771 static int setup_boot_id(const char *dest) {
772         _cleanup_free_ char *from = NULL, *to = NULL;
773         sd_id128_t rnd = {};
774         char as_uuid[37];
775         int r;
776
777         assert(dest);
778
779         if (arg_share_system)
780                 return 0;
781
782         /* Generate a new randomized boot ID, so that each boot-up of
783          * the container gets a new one */
784
785         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
786         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
787         if (!from || !to)
788                 return log_oom();
789
790         r = sd_id128_randomize(&rnd);
791         if (r < 0) {
792                 log_error("Failed to generate random boot id: %s", strerror(-r));
793                 return r;
794         }
795
796         snprintf(as_uuid, sizeof(as_uuid),
797                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
798                  SD_ID128_FORMAT_VAL(rnd));
799         char_array_0(as_uuid);
800
801         r = write_string_file(from, as_uuid);
802         if (r < 0) {
803                 log_error("Failed to write boot id: %s", strerror(-r));
804                 return r;
805         }
806
807         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
808                 log_error("Failed to bind mount boot id: %m");
809                 r = -errno;
810         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
811                 log_warning("Failed to make boot id read-only: %m");
812
813         unlink(from);
814         return r;
815 }
816
817 static int copy_devnodes(const char *dest) {
818
819         static const char devnodes[] =
820                 "null\0"
821                 "zero\0"
822                 "full\0"
823                 "random\0"
824                 "urandom\0"
825                 "tty\0";
826
827         const char *d;
828         int r = 0;
829         _cleanup_umask_ mode_t u;
830
831         assert(dest);
832
833         u = umask(0000);
834
835         NULSTR_FOREACH(d, devnodes) {
836                 _cleanup_free_ char *from = NULL, *to = NULL;
837                 struct stat st;
838
839                 from = strappend("/dev/", d);
840                 to = strjoin(dest, "/dev/", d, NULL);
841                 if (!from || !to)
842                         return log_oom();
843
844                 if (stat(from, &st) < 0) {
845
846                         if (errno != ENOENT) {
847                                 log_error("Failed to stat %s: %m", from);
848                                 return -errno;
849                         }
850
851                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
852
853                         log_error("%s is not a char or block device, cannot copy", from);
854                         return -EIO;
855
856                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
857
858                         log_error("mknod(%s) failed: %m", dest);
859                         return  -errno;
860                 }
861         }
862
863         return r;
864 }
865
866 static int setup_ptmx(const char *dest) {
867         _cleanup_free_ char *p = NULL;
868
869         p = strappend(dest, "/dev/ptmx");
870         if (!p)
871                 return log_oom();
872
873         if (symlink("pts/ptmx", p) < 0) {
874                 log_error("Failed to create /dev/ptmx symlink: %m");
875                 return -errno;
876         }
877
878         return 0;
879 }
880
881 static int setup_dev_console(const char *dest, const char *console) {
882         _cleanup_umask_ mode_t u;
883         const char *to;
884         struct stat st;
885         int r;
886
887         assert(dest);
888         assert(console);
889
890         u = umask(0000);
891
892         if (stat("/dev/null", &st) < 0) {
893                 log_error("Failed to stat /dev/null: %m");
894                 return -errno;
895         }
896
897         r = chmod_and_chown(console, 0600, 0, 0);
898         if (r < 0) {
899                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
900                 return r;
901         }
902
903         /* We need to bind mount the right tty to /dev/console since
904          * ptys can only exist on pts file systems. To have something
905          * to bind mount things on we create a device node first, and
906          * use /dev/null for that since we the cgroups device policy
907          * allows us to create that freely, while we cannot create
908          * /dev/console. (Note that the major minor doesn't actually
909          * matter here, since we mount it over anyway). */
910
911         to = strappenda(dest, "/dev/console");
912         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
913                 log_error("mknod() for /dev/console failed: %m");
914                 return -errno;
915         }
916
917         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
918                 log_error("Bind mount for /dev/console failed: %m");
919                 return -errno;
920         }
921
922         return 0;
923 }
924
925 static int setup_kmsg(const char *dest, int kmsg_socket) {
926         _cleanup_free_ char *from = NULL, *to = NULL;
927         int r, fd, k;
928         _cleanup_umask_ mode_t u;
929         union {
930                 struct cmsghdr cmsghdr;
931                 uint8_t buf[CMSG_SPACE(sizeof(int))];
932         } control = {};
933         struct msghdr mh = {
934                 .msg_control = &control,
935                 .msg_controllen = sizeof(control),
936         };
937         struct cmsghdr *cmsg;
938
939         assert(dest);
940         assert(kmsg_socket >= 0);
941
942         u = umask(0000);
943
944         /* We create the kmsg FIFO as /dev/kmsg, but immediately
945          * delete it after bind mounting it to /proc/kmsg. While FIFOs
946          * on the reading side behave very similar to /proc/kmsg,
947          * their writing side behaves differently from /dev/kmsg in
948          * that writing blocks when nothing is reading. In order to
949          * avoid any problems with containers deadlocking due to this
950          * we simply make /dev/kmsg unavailable to the container. */
951         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
952             asprintf(&to, "%s/proc/kmsg", dest) < 0)
953                 return log_oom();
954
955         if (mkfifo(from, 0600) < 0) {
956                 log_error("mkfifo() for /dev/kmsg failed: %m");
957                 return -errno;
958         }
959
960         r = chmod_and_chown(from, 0600, 0, 0);
961         if (r < 0) {
962                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
963                 return r;
964         }
965
966         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
967                 log_error("Bind mount for /proc/kmsg failed: %m");
968                 return -errno;
969         }
970
971         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
972         if (fd < 0) {
973                 log_error("Failed to open fifo: %m");
974                 return -errno;
975         }
976
977         cmsg = CMSG_FIRSTHDR(&mh);
978         cmsg->cmsg_level = SOL_SOCKET;
979         cmsg->cmsg_type = SCM_RIGHTS;
980         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
981         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
982
983         mh.msg_controllen = cmsg->cmsg_len;
984
985         /* Store away the fd in the socket, so that it stays open as
986          * long as we run the child */
987         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
988         close_nointr_nofail(fd);
989
990         if (k < 0) {
991                 log_error("Failed to send FIFO fd: %m");
992                 return -errno;
993         }
994
995         /* And now make the FIFO unavailable as /dev/kmsg... */
996         unlink(from);
997         return 0;
998 }
999
1000 static int setup_hostname(void) {
1001
1002         if (arg_share_system)
1003                 return 0;
1004
1005         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
1006                 return -errno;
1007
1008         return 0;
1009 }
1010
1011 static int setup_journal(const char *directory) {
1012         sd_id128_t machine_id, this_id;
1013         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1014         char *id;
1015         int r;
1016
1017         p = strappend(directory, "/etc/machine-id");
1018         if (!p)
1019                 return log_oom();
1020
1021         r = read_one_line_file(p, &b);
1022         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1023                 return 0;
1024         else if (r < 0) {
1025                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1026                 return r;
1027         }
1028
1029         id = strstrip(b);
1030         if (isempty(id) && arg_link_journal == LINK_AUTO)
1031                 return 0;
1032
1033         /* Verify validity */
1034         r = sd_id128_from_string(id, &machine_id);
1035         if (r < 0) {
1036                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1037                 return r;
1038         }
1039
1040         r = sd_id128_get_machine(&this_id);
1041         if (r < 0) {
1042                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1043                 return r;
1044         }
1045
1046         if (sd_id128_equal(machine_id, this_id)) {
1047                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1048                          "Host and machine ids are equal (%s): refusing to link journals", id);
1049                 if (arg_link_journal == LINK_AUTO)
1050                         return 0;
1051                 return
1052                         -EEXIST;
1053         }
1054
1055         if (arg_link_journal == LINK_NO)
1056                 return 0;
1057
1058         free(p);
1059         p = strappend("/var/log/journal/", id);
1060         q = strjoin(directory, "/var/log/journal/", id, NULL);
1061         if (!p || !q)
1062                 return log_oom();
1063
1064         if (path_is_mount_point(p, false) > 0) {
1065                 if (arg_link_journal != LINK_AUTO) {
1066                         log_error("%s: already a mount point, refusing to use for journal", p);
1067                         return -EEXIST;
1068                 }
1069
1070                 return 0;
1071         }
1072
1073         if (path_is_mount_point(q, false) > 0) {
1074                 if (arg_link_journal != LINK_AUTO) {
1075                         log_error("%s: already a mount point, refusing to use for journal", q);
1076                         return -EEXIST;
1077                 }
1078
1079                 return 0;
1080         }
1081
1082         r = readlink_and_make_absolute(p, &d);
1083         if (r >= 0) {
1084                 if ((arg_link_journal == LINK_GUEST ||
1085                      arg_link_journal == LINK_AUTO) &&
1086                     path_equal(d, q)) {
1087
1088                         r = mkdir_p(q, 0755);
1089                         if (r < 0)
1090                                 log_warning("failed to create directory %s: %m", q);
1091                         return 0;
1092                 }
1093
1094                 if (unlink(p) < 0) {
1095                         log_error("Failed to remove symlink %s: %m", p);
1096                         return -errno;
1097                 }
1098         } else if (r == -EINVAL) {
1099
1100                 if (arg_link_journal == LINK_GUEST &&
1101                     rmdir(p) < 0) {
1102
1103                         if (errno == ENOTDIR) {
1104                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1105                                 return r;
1106                         } else {
1107                                 log_error("Failed to remove %s: %m", p);
1108                                 return -errno;
1109                         }
1110                 }
1111         } else if (r != -ENOENT) {
1112                 log_error("readlink(%s) failed: %m", p);
1113                 return r;
1114         }
1115
1116         if (arg_link_journal == LINK_GUEST) {
1117
1118                 if (symlink(q, p) < 0) {
1119                         log_error("Failed to symlink %s to %s: %m", q, p);
1120                         return -errno;
1121                 }
1122
1123                 r = mkdir_p(q, 0755);
1124                 if (r < 0)
1125                         log_warning("failed to create directory %s: %m", q);
1126                 return 0;
1127         }
1128
1129         if (arg_link_journal == LINK_HOST) {
1130                 r = mkdir_p(p, 0755);
1131                 if (r < 0) {
1132                         log_error("Failed to create %s: %m", p);
1133                         return r;
1134                 }
1135
1136         } else if (access(p, F_OK) < 0)
1137                 return 0;
1138
1139         if (dir_is_empty(q) == 0) {
1140                 log_error("%s not empty.", q);
1141                 return -ENOTEMPTY;
1142         }
1143
1144         r = mkdir_p(q, 0755);
1145         if (r < 0) {
1146                 log_error("Failed to create %s: %m", q);
1147                 return r;
1148         }
1149
1150         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1151                 log_error("Failed to bind mount journal from host into guest: %m");
1152                 return -errno;
1153         }
1154
1155         return 0;
1156 }
1157
1158 static int setup_kdbus(const char *dest, const char *path) {
1159         const char *p;
1160
1161         if (!path)
1162                 return 0;
1163
1164         p = strappenda(dest, "/dev/kdbus");
1165         if (mkdir(p, 0755) < 0) {
1166                 log_error("Failed to create kdbus path: %m");
1167                 return  -errno;
1168         }
1169
1170         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1171                 log_error("Failed to mount kdbus domain path: %m");
1172                 return -errno;
1173         }
1174
1175         return 0;
1176 }
1177
1178 static int drop_capabilities(void) {
1179         return capability_bounding_set_drop(~arg_retain, false);
1180 }
1181
1182 static int register_machine(pid_t pid) {
1183         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1184         _cleanup_bus_unref_ sd_bus *bus = NULL;
1185         int r;
1186
1187         if (!arg_register)
1188                 return 0;
1189
1190         r = sd_bus_default_system(&bus);
1191         if (r < 0) {
1192                 log_error("Failed to open system bus: %s", strerror(-r));
1193                 return r;
1194         }
1195
1196         if (arg_keep_unit) {
1197                 r = sd_bus_call_method(
1198                                 bus,
1199                                 "org.freedesktop.machine1",
1200                                 "/org/freedesktop/machine1",
1201                                 "org.freedesktop.machine1.Manager",
1202                                 "RegisterMachine",
1203                                 &error,
1204                                 NULL,
1205                                 "sayssus",
1206                                 arg_machine,
1207                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1208                                 "nspawn",
1209                                 "container",
1210                                 (uint32_t) pid,
1211                                 strempty(arg_directory));
1212         } else {
1213                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1214
1215                 r = sd_bus_message_new_method_call(
1216                                 bus,
1217                                 &m,
1218                                 "org.freedesktop.machine1",
1219                                 "/org/freedesktop/machine1",
1220                                 "org.freedesktop.machine1.Manager",
1221                                 "CreateMachine");
1222                 if (r < 0) {
1223                         log_error("Failed to create message: %s", strerror(-r));
1224                         return r;
1225                 }
1226
1227                 r = sd_bus_message_append(
1228                                 m,
1229                                 "sayssus",
1230                                 arg_machine,
1231                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1232                                 "nspawn",
1233                                 "container",
1234                                 (uint32_t) pid,
1235                                 strempty(arg_directory));
1236                 if (r < 0) {
1237                         log_error("Failed to append message arguments: %s", strerror(-r));
1238                         return r;
1239                 }
1240
1241                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1242                 if (r < 0) {
1243                         log_error("Failed to open container: %s", strerror(-r));
1244                         return r;
1245                 }
1246
1247                 if (!isempty(arg_slice)) {
1248                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1249                         if (r < 0) {
1250                                 log_error("Failed to append slice: %s", strerror(-r));
1251                                 return r;
1252                         }
1253                 }
1254
1255                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1256                 if (r < 0) {
1257                         log_error("Failed to add device policy: %s", strerror(-r));
1258                         return r;
1259                 }
1260
1261                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 10,
1262                                           /* Allow the container to
1263                                            * access and create the API
1264                                            * device nodes, so that
1265                                            * PrivateDevices= in the
1266                                            * container can work
1267                                            * fine */
1268                                           "/dev/null", "rwm",
1269                                           "/dev/zero", "rwm",
1270                                           "/dev/full", "rwm",
1271                                           "/dev/random", "rwm",
1272                                           "/dev/urandom", "rwm",
1273                                           "/dev/tty", "rwm",
1274                                           /* Allow the container
1275                                            * access to ptys. However,
1276                                            * do not permit the
1277                                            * container to ever create
1278                                            * these device nodes. */
1279                                           "/dev/pts/ptmx", "rw",
1280                                           "char-pts", "rw",
1281                                           /* Allow the container
1282                                            * access to all kdbus
1283                                            * devices. Again, the
1284                                            * container cannot create
1285                                            * these nodes, only use
1286                                            * them. We use a pretty
1287                                            * open match here, so that
1288                                            * the kernel API can still
1289                                            * change. */
1290                                           "char-kdbus", "rw",
1291                                           "char-kdbus/*", "rw");
1292                 if (r < 0) {
1293                         log_error("Failed to add device whitelist: %s", strerror(-r));
1294                         return r;
1295                 }
1296
1297                 r = sd_bus_message_close_container(m);
1298                 if (r < 0) {
1299                         log_error("Failed to close container: %s", strerror(-r));
1300                         return r;
1301                 }
1302
1303                 r = sd_bus_call(bus, m, 0, &error, NULL);
1304         }
1305
1306         if (r < 0) {
1307                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1308                 return r;
1309         }
1310
1311         return 0;
1312 }
1313
1314 static int terminate_machine(pid_t pid) {
1315         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1316         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1317         _cleanup_bus_unref_ sd_bus *bus = NULL;
1318         const char *path;
1319         int r;
1320
1321         if (!arg_register)
1322                 return 0;
1323
1324         r = sd_bus_default_system(&bus);
1325         if (r < 0) {
1326                 log_error("Failed to open system bus: %s", strerror(-r));
1327                 return r;
1328         }
1329
1330         r = sd_bus_call_method(
1331                         bus,
1332                         "org.freedesktop.machine1",
1333                         "/org/freedesktop/machine1",
1334                         "org.freedesktop.machine1.Manager",
1335                         "GetMachineByPID",
1336                         &error,
1337                         &reply,
1338                         "u",
1339                         (uint32_t) pid);
1340         if (r < 0) {
1341                 /* Note that the machine might already have been
1342                  * cleaned up automatically, hence don't consider it a
1343                  * failure if we cannot get the machine object. */
1344                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1345                 return 0;
1346         }
1347
1348         r = sd_bus_message_read(reply, "o", &path);
1349         if (r < 0)
1350                 return bus_log_parse_error(r);
1351
1352         r = sd_bus_call_method(
1353                         bus,
1354                         "org.freedesktop.machine1",
1355                         path,
1356                         "org.freedesktop.machine1.Machine",
1357                         "Terminate",
1358                         &error,
1359                         NULL,
1360                         NULL);
1361         if (r < 0) {
1362                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1363                 return 0;
1364         }
1365
1366         return 0;
1367 }
1368
1369 static int reset_audit_loginuid(void) {
1370         _cleanup_free_ char *p = NULL;
1371         int r;
1372
1373         if (arg_share_system)
1374                 return 0;
1375
1376         r = read_one_line_file("/proc/self/loginuid", &p);
1377         if (r == -ENOENT)
1378                 return 0;
1379         if (r < 0) {
1380                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1381                 return r;
1382         }
1383
1384         /* Already reset? */
1385         if (streq(p, "4294967295"))
1386                 return 0;
1387
1388         r = write_string_file("/proc/self/loginuid", "4294967295");
1389         if (r < 0) {
1390                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1391                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1392                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1393                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1394                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1395
1396                 sleep(5);
1397         }
1398
1399         return 0;
1400 }
1401
1402 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1403         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1404         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1405         int r;
1406
1407         if (!arg_private_network)
1408                 return 0;
1409
1410         if (!arg_network_veth)
1411                 return 0;
1412
1413         /* Use two different interface name prefixes depending whether
1414          * we are in bridge mode or not. */
1415         if (arg_network_bridge)
1416                 memcpy(iface_name, "vb-", 3);
1417         else
1418                 memcpy(iface_name, "ve-", 3);
1419
1420         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1421
1422         r = sd_rtnl_open(&rtnl, 0);
1423         if (r < 0) {
1424                 log_error("Failed to connect to netlink: %s", strerror(-r));
1425                 return r;
1426         }
1427
1428         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1429         if (r < 0) {
1430                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1431                 return r;
1432         }
1433
1434         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1435         if (r < 0) {
1436                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1437                 return r;
1438         }
1439
1440         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1441         if (r < 0) {
1442                 log_error("Failed to open netlink container: %s", strerror(-r));
1443                 return r;
1444         }
1445
1446         r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1447         if (r < 0) {
1448                 log_error("Failed to append netlink kind: %s", strerror(-r));
1449                 return r;
1450         }
1451
1452         r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1453         if (r < 0) {
1454                 log_error("Failed to open netlink container: %s", strerror(-r));
1455                 return r;
1456         }
1457
1458         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1459         if (r < 0) {
1460                 log_error("Failed to open netlink container: %s", strerror(-r));
1461                 return r;
1462         }
1463
1464         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1465         if (r < 0) {
1466                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1467                 return r;
1468         }
1469
1470         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1471         if (r < 0) {
1472                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1473                 return r;
1474         }
1475
1476         r = sd_rtnl_message_close_container(m);
1477         if (r < 0) {
1478                 log_error("Failed to close netlink container: %s", strerror(-r));
1479                 return r;
1480         }
1481
1482         r = sd_rtnl_message_close_container(m);
1483         if (r < 0) {
1484                 log_error("Failed to close netlink container: %s", strerror(-r));
1485                 return r;
1486         }
1487
1488         r = sd_rtnl_message_close_container(m);
1489         if (r < 0) {
1490                 log_error("Failed to close netlink container: %s", strerror(-r));
1491                 return r;
1492         }
1493
1494         r = sd_rtnl_call(rtnl, m, 0, NULL);
1495         if (r < 0) {
1496                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1497                 return r;
1498         }
1499
1500         return 0;
1501 }
1502
1503 static int setup_bridge(const char veth_name[]) {
1504         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1505         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1506         int r, bridge;
1507
1508         if (!arg_private_network)
1509                 return 0;
1510
1511         if (!arg_network_veth)
1512                 return 0;
1513
1514         if (!arg_network_bridge)
1515                 return 0;
1516
1517         bridge = (int) if_nametoindex(arg_network_bridge);
1518         if (bridge <= 0) {
1519                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1520                 return -errno;
1521         }
1522
1523         r = sd_rtnl_open(&rtnl, 0);
1524         if (r < 0) {
1525                 log_error("Failed to connect to netlink: %s", strerror(-r));
1526                 return r;
1527         }
1528
1529         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1530         if (r < 0) {
1531                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1532                 return r;
1533         }
1534
1535         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1536         if (r < 0) {
1537                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1538                 return r;
1539         }
1540
1541         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1542         if (r < 0) {
1543                 log_error("Failed to add netlink master field: %s", strerror(-r));
1544                 return r;
1545         }
1546
1547         r = sd_rtnl_call(rtnl, m, 0, NULL);
1548         if (r < 0) {
1549                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1550                 return r;
1551         }
1552
1553         return 0;
1554 }
1555
1556 static int parse_interface(struct udev *udev, const char *name) {
1557         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1558         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1559         int ifi;
1560
1561         ifi = (int) if_nametoindex(name);
1562         if (ifi <= 0) {
1563                 log_error("Failed to resolve interface %s: %m", name);
1564                 return -errno;
1565         }
1566
1567         sprintf(ifi_str, "n%i", ifi);
1568         d = udev_device_new_from_device_id(udev, ifi_str);
1569         if (!d) {
1570                 log_error("Failed to get udev device for interface %s: %m", name);
1571                 return -errno;
1572         }
1573
1574         if (udev_device_get_is_initialized(d) <= 0) {
1575                 log_error("Network interface %s is not initialized yet.", name);
1576                 return -EBUSY;
1577         }
1578
1579         return ifi;
1580 }
1581
1582 static int move_network_interfaces(pid_t pid) {
1583         _cleanup_udev_unref_ struct udev *udev = NULL;
1584         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1585         char **i;
1586         int r;
1587
1588         if (!arg_private_network)
1589                 return 0;
1590
1591         if (strv_isempty(arg_network_interfaces))
1592                 return 0;
1593
1594         r = sd_rtnl_open(&rtnl, 0);
1595         if (r < 0) {
1596                 log_error("Failed to connect to netlink: %s", strerror(-r));
1597                 return r;
1598         }
1599
1600         udev = udev_new();
1601         if (!udev) {
1602                 log_error("Failed to connect to udev.");
1603                 return -ENOMEM;
1604         }
1605
1606         STRV_FOREACH(i, arg_network_interfaces) {
1607                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1608                 int ifi;
1609
1610                 ifi = parse_interface(udev, *i);
1611                 if (ifi < 0)
1612                         return ifi;
1613
1614                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1615                 if (r < 0) {
1616                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1617                         return r;
1618                 }
1619
1620                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1621                 if (r < 0) {
1622                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1623                         return r;
1624                 }
1625
1626                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1627                 if (r < 0) {
1628                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1629                         return r;
1630                 }
1631         }
1632
1633         return 0;
1634 }
1635
1636 static int setup_macvlan(pid_t pid) {
1637         _cleanup_udev_unref_ struct udev *udev = NULL;
1638         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1639         char **i;
1640         int r;
1641
1642         if (!arg_private_network)
1643                 return 0;
1644
1645         if (strv_isempty(arg_network_macvlan))
1646                 return 0;
1647
1648         r = sd_rtnl_open(&rtnl, 0);
1649         if (r < 0) {
1650                 log_error("Failed to connect to netlink: %s", strerror(-r));
1651                 return r;
1652         }
1653
1654         udev = udev_new();
1655         if (!udev) {
1656                 log_error("Failed to connect to udev.");
1657                 return -ENOMEM;
1658         }
1659
1660         STRV_FOREACH(i, arg_network_macvlan) {
1661                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1662                 _cleanup_free_ char *n = NULL;
1663                 int ifi;
1664
1665                 ifi = parse_interface(udev, *i);
1666                 if (ifi < 0)
1667                         return ifi;
1668
1669                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1670                 if (r < 0) {
1671                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1672                         return r;
1673                 }
1674
1675                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1676                 if (r < 0) {
1677                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1678                         return r;
1679                 }
1680
1681                 n = strappend("mv-", *i);
1682                 if (!n)
1683                         return log_oom();
1684
1685                 strshorten(n, IFNAMSIZ-1);
1686
1687                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1688                 if (r < 0) {
1689                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1690                         return r;
1691                 }
1692
1693                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1694                 if (r < 0) {
1695                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1696                         return r;
1697                 }
1698
1699                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1700                 if (r < 0) {
1701                         log_error("Failed to open netlink container: %s", strerror(-r));
1702                         return r;
1703                 }
1704
1705                 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "macvlan");
1706                 if (r < 0) {
1707                         log_error("Failed to append netlink kind: %s", strerror(-r));
1708                         return r;
1709                 }
1710
1711                 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1712                 if (r < 0) {
1713                         log_error("Failed to open netlink container: %s", strerror(-r));
1714                         return r;
1715                 }
1716
1717                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1718                 if (r < 0) {
1719                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1720                         return r;
1721                 }
1722
1723                 r = sd_rtnl_message_close_container(m);
1724                 if (r < 0) {
1725                         log_error("Failed to close netlink container: %s", strerror(-r));
1726                         return r;
1727                 }
1728
1729                 r = sd_rtnl_message_close_container(m);
1730                 if (r < 0) {
1731                         log_error("Failed to close netlink container: %s", strerror(-r));
1732                         return r;
1733                 }
1734
1735                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1736                 if (r < 0) {
1737                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1738                         return r;
1739                 }
1740         }
1741
1742         return 0;
1743 }
1744
/* Installs a seccomp filter that makes creation of audit netlink sockets
 * fail. Without seccomp support compiled in this is a no-op returning 0.
 *
 * Returns 0 (or whatever non-negative value seccomp_load() returns) on
 * success and a negative errno-style value on failure. */
static int audit_still_doesnt_work_in_containers(void) {

#ifndef HAVE_SECCOMP
        return 0;
#else
        scmp_filter_ctx filter;
        int r;

        /*
           Audit is broken in containers, much of the userspace audit
           hookup will fail if running inside a container. We don't
           care and just turn off creation of audit sockets.

           This will make socket(AF_NETLINK, *, NETLINK_AUDIT) fail
           with EAFNOSUPPORT which audit userspace uses as indication
           that audit is disabled in the kernel.
         */

        /* Default action: allow everything that no rule matches */
        filter = seccomp_init(SCMP_ACT_ALLOW);
        if (!filter)
                return log_oom();

        r = seccomp_add_secondary_archs(filter);
        if (r < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-r));
                goto release;
        }

        /* Match on the first (domain) and third (protocol) argument of socket() */
        r = seccomp_rule_add(
                        filter,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (r < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-r));
                goto release;
        }

        r = seccomp_attr_set(filter, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-r));
                goto release;
        }

        r = seccomp_load(filter);
        if (r < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-r));

release:
        /* The filter context is released on every path past seccomp_init() */
        seccomp_release(filter);
        return r;
#endif

}
1801
1802 static int setup_image(char **device_path, int *loop_nr) {
1803         struct loop_info64 info = {
1804                 .lo_flags = LO_FLAGS_AUTOCLEAR|LO_FLAGS_PARTSCAN
1805         };
1806         _cleanup_close_ int fd = -1, control = -1, loop = -1;
1807         _cleanup_free_ char* loopdev = NULL;
1808         struct stat st;
1809         int r, nr;
1810
1811         assert(device_path);
1812         assert(loop_nr);
1813
1814         fd = open(arg_image, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1815         if (fd < 0) {
1816                 log_error("Failed to open %s: %m", arg_image);
1817                 return -errno;
1818         }
1819
1820         if (fstat(fd, &st) < 0) {
1821                 log_error("Failed to stat %s: %m", arg_image);
1822                 return -errno;
1823         }
1824
1825         if (S_ISBLK(st.st_mode)) {
1826                 char *p;
1827
1828                 p = strdup(arg_image);
1829                 if (!p)
1830                         return log_oom();
1831
1832                 *device_path = p;
1833
1834                 *loop_nr = -1;
1835
1836                 r = fd;
1837                 fd = -1;
1838
1839                 return r;
1840         }
1841
1842         if (!S_ISREG(st.st_mode)) {
1843                 log_error("%s is not a regular file or block device: %m", arg_image);
1844                 return -EINVAL;
1845         }
1846
1847         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
1848         if (control < 0) {
1849                 log_error("Failed to open /dev/loop-control: %m");
1850                 return -errno;
1851         }
1852
1853         nr = ioctl(control, LOOP_CTL_GET_FREE);
1854         if (nr < 0) {
1855                 log_error("Failed to allocate loop device: %m");
1856                 return -errno;
1857         }
1858
1859         if (asprintf(&loopdev, "/dev/loop%i", nr) < 0)
1860                 return log_oom();
1861
1862         loop = open(loopdev, O_CLOEXEC|(arg_read_only ? O_RDONLY : O_RDWR)|O_NONBLOCK|O_NOCTTY);
1863         if (loop < 0) {
1864                 log_error("Failed to open loop device %s: %m", loopdev);
1865                 return -errno;
1866         }
1867
1868         if (ioctl(loop, LOOP_SET_FD, fd) < 0) {
1869                 log_error("Failed to set loopback file descriptor on %s: %m", loopdev);
1870                 return -errno;
1871         }
1872
1873         if (arg_read_only)
1874                 info.lo_flags |= LO_FLAGS_READ_ONLY;
1875
1876         if (ioctl(loop, LOOP_SET_STATUS64, &info) < 0) {
1877                 log_error("Failed to set loopback settings on %s: %m", loopdev);
1878                 return -errno;
1879         }
1880
1881         *device_path = loopdev;
1882         loopdev = NULL;
1883
1884         *loop_nr = nr;
1885
1886         r = loop;
1887         loop = -1;
1888
1889         return r;
1890 }
1891
#ifdef HAVE_BLKID
/* Records "node" as the device for one partition role unless a candidate
 * with a lower or equal partition number has been recorded already.
 *
 * Returns 0 if the candidate was stored or ignored, and the result of
 * log_oom() if the node path could not be duplicated. */
static int dissect_image_consider(char **device, int *device_nr, const char *node, int nr) {
        if (*device && nr >= *device_nr)
                return 0;

        *device_nr = nr;
        free(*device);
        *device = strdup(node);
        if (!*device)
                return log_oom();

        return 0;
}
#endif

/* Looks at the partition table of the block device open as "fd" and picks
 * the partitions to use for the container.
 *
 * The device must carry a GPT. Its partition devices are enumerated through
 * udev and matched by GPT type ID: home (GPT_HOME), server data (GPT_SRV)
 * and, where the respective macros are defined, the native root
 * (GPT_ROOT_NATIVE) and secondary root (GPT_ROOT_SECONDARY). If several
 * partitions share a type, the one with the lowest partition number wins.
 *
 * On success returns 0, stores a newly allocated device node path in
 * *root_device and sets *secondary to whether the secondary root was chosen
 * because no native one exists. *home_device and *srv_device are only
 * written if a matching partition was found. The caller owns the strings.
 *
 * Returns a negative errno-style value on failure, -EINVAL in particular if
 * no GPT or no root partition is found, and -ENOTSUP if compiled without
 * blkid support. */
static int dissect_image(
                int fd,
                char **root_device,
                char **home_device,
                char **srv_device,
                bool *secondary) {

#ifdef HAVE_BLKID
        int home_nr = -1, root_nr = -1, secondary_root_nr = -1, srv_nr = -1;
        _cleanup_free_ char *home = NULL, *root = NULL, *secondary_root = NULL, *srv = NULL;
        _cleanup_udev_enumerate_unref_ struct udev_enumerate *enumerate = NULL;
        _cleanup_udev_device_unref_ struct udev_device *whole_disk = NULL;
        _cleanup_blkid_free_probe_ blkid_probe probe = NULL;
        _cleanup_udev_unref_ struct udev *udev = NULL;
        struct udev_list_entry *first, *item;
        const char *pttype = NULL;
        blkid_partlist partitions;
        struct stat st;
        int r;

        assert(fd >= 0);
        assert(root_device);
        assert(home_device);
        assert(srv_device);
        assert(secondary);

        probe = blkid_new_probe();
        if (!probe)
                return log_oom();

        /* errno is cleared before calls that are not relied upon to set it,
         * so that a failure with errno still 0 can be told apart. */
        errno = 0;
        r = blkid_probe_set_device(probe, fd, 0, 0);
        if (r != 0) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to set device on blkid probe: %m");
                return -errno;
        }

        blkid_probe_enable_partitions(probe, 1);
        blkid_probe_set_partitions_flags(probe, BLKID_PARTS_ENTRY_DETAILS);

        errno = 0;
        r = blkid_do_safeprobe(probe);
        if (r == -2 || r == 1) {
                log_error("Failed to identify any partition table on %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe: %m");
                return -errno;
        }

        blkid_probe_lookup_value(probe, "PTTYPE", &pttype, NULL);
        if (!streq_ptr(pttype, "gpt")) {
                log_error("Image %s does not carry a GUID Partition Table.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        errno = 0;
        partitions = blkid_probe_get_partitions(probe);
        if (!partitions) {
                if (errno == 0)
                        return log_oom();

                log_error("Failed to list partitions of %s", arg_image);
                return -errno;
        }

        udev = udev_new();
        if (!udev)
                return log_oom();

        if (fstat(fd, &st) < 0) {
                log_error("Failed to stat block device: %m");
                return -errno;
        }

        /* Enumerate all udev devices that have the probed block device as parent */
        whole_disk = udev_device_new_from_devnum(udev, 'b', st.st_rdev);
        if (!whole_disk)
                return log_oom();

        enumerate = udev_enumerate_new(udev);
        if (!enumerate)
                return log_oom();

        r = udev_enumerate_add_match_parent(enumerate, whole_disk);
        if (r < 0)
                return log_oom();

        r = udev_enumerate_scan_devices(enumerate);
        if (r < 0) {
                log_error("Failed to scan for partition devices of %s: %s", arg_image, strerror(-r));
                return r;
        }

        first = udev_enumerate_get_list_entry(enumerate);
        udev_list_entry_foreach(item, first) {
                _cleanup_udev_device_unref_ struct udev_device *q = NULL;
                const char *stype, *node;
                sd_id128_t type_id;
                blkid_partition pp;
                dev_t qn;
                int nr, k;

                errno = 0;
                q = udev_device_new_from_syspath(udev, udev_list_entry_get_name(item));
                if (!q) {
                        if (!errno)
                                errno = ENOMEM;

                        log_error("Failed to get partition device of %s: %m", arg_image);
                        return -errno;
                }

                /* Skip entries without a device number and the probed device itself */
                qn = udev_device_get_devnum(q);
                if (major(qn) == 0)
                        continue;

                if (st.st_rdev == qn)
                        continue;

                node = udev_device_get_devnode(q);
                if (!node)
                        continue;

                /* Map the udev device back to its entry in the blkid partition list */
                pp = blkid_partlist_devno_to_partition(partitions, qn);
                if (!pp)
                        continue;

                nr = blkid_partition_get_partno(pp);
                if (nr < 0)
                        continue;

                stype = blkid_partition_get_type_string(pp);
                if (!stype)
                        continue;

                if (sd_id128_from_string(stype, &type_id) < 0)
                        continue;

                if (sd_id128_equal(type_id, GPT_HOME))
                        k = dissect_image_consider(&home, &home_nr, node, nr);
                else if (sd_id128_equal(type_id, GPT_SRV))
                        k = dissect_image_consider(&srv, &srv_nr, node, nr);
#ifdef GPT_ROOT_NATIVE
                else if (sd_id128_equal(type_id, GPT_ROOT_NATIVE))
                        k = dissect_image_consider(&root, &root_nr, node, nr);
#endif
#ifdef GPT_ROOT_SECONDARY
                else if (sd_id128_equal(type_id, GPT_ROOT_SECONDARY))
                        k = dissect_image_consider(&secondary_root, &secondary_root_nr, node, nr);
#endif
                else
                        k = 0; /* partition type we do not care about */

                if (k < 0)
                        return k;
        }

        if (!root && !secondary_root) {
                log_error("Failed to identify root partition in disk image %s.\n"
                          "Note that the disk image needs to follow http://www.freedesktop.org/wiki/Specifications/DiscoverablePartitionsSpec/ to be supported by systemd-nspawn.", arg_image);
                return -EINVAL;
        }

        /* Hand the results over to the caller; cleared pointers are skipped
         * by the cleanup handlers. The native root is preferred. */
        if (root) {
                *root_device = root;
                root = NULL;
                *secondary = false;
        } else {
                /* Non-NULL here, as guaranteed by the check above */
                *root_device = secondary_root;
                secondary_root = NULL;
                *secondary = true;
        }

        if (home) {
                *home_device = home;
                home = NULL;
        }

        if (srv) {
                *srv_device = srv;
                srv = NULL;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2118
/* Mounts the block device "what" on "where", or on "where" with "directory"
 * appended if "directory" is non-NULL. The file system type is detected with
 * libblkid; LUKS volumes are rejected.
 *
 * The mount is always done with MS_NODEV, plus MS_RDONLY if arg_read_only
 * is set.
 *
 * Returns 0 on success and a negative errno-style value on failure: -EINVAL
 * if no (unambiguous) file system type can be determined, -ENOTSUP for LUKS
 * volumes or if compiled without blkid support. */
static int mount_device(const char *what, const char *where, const char *directory) {
#ifdef HAVE_BLKID
        _cleanup_blkid_free_probe_ blkid_probe b = NULL;
        const char *fstype, *p;
        int r;

        assert(what);
        assert(where);

        if (directory)
                p = strappenda(where, directory);
        else
                p = where;

        errno = 0;
        b = blkid_new_probe_from_filename(what);
        if (!b) {
                if (errno == 0)
                        return log_oom();
                log_error("Failed to allocate prober for %s: %m", what);
                return -errno;
        }

        blkid_probe_enable_superblocks(b, 1);
        blkid_probe_set_superblocks_flags(b, BLKID_SUBLKS_TYPE);

        /* blkid_do_safeprobe() returns 0 on success, 1 if nothing was
         * detected, -2 if the result is ambiguous and -1 on error. The first
         * two non-zero cases mean "type unknown"; only a real error goes
         * through the errno path. This matches the check in dissect_image(). */
        errno = 0;
        r = blkid_do_safeprobe(b);
        if (r == -2 || r == 1) {
                log_error("Cannot determine file system type of %s", what);
                return -EINVAL;
        } else if (r != 0) {
                if (errno == 0)
                        errno = EIO;
                log_error("Failed to probe %s: %m", what);
                return -errno;
        }

        errno = 0;
        if (blkid_probe_lookup_value(b, "TYPE", &fstype, NULL) < 0) {
                if (errno == 0)
                        errno = EINVAL;
                log_error("Failed to determine file system type of %s", what);
                return -errno;
        }

        if (streq(fstype, "crypto_LUKS")) {
                log_error("nspawn currently does not support LUKS disk images.");
                return -ENOTSUP;
        }

        /* MS_NODEV applies regardless of read-only mode. The previous
         * expression "arg_read_only ? MS_NODEV|MS_RDONLY : 0" dropped it for
         * writable mounts. */
        if (mount(what, p, fstype, MS_NODEV|(arg_read_only ? MS_RDONLY : 0), NULL) < 0) {
                log_error("Failed to mount %s: %m", what);
                return -errno;
        }

        return 0;
#else
        log_error("--image= is not supported, compiled without blkid support.");
        return -ENOTSUP;
#endif
}
2181
/* Mounts the dissected partitions below "where": the root device on "where"
 * itself, the home device on "where" + "/home" and the server data device
 * on "where" + "/srv". Any of the device arguments may be NULL, in which
 * case that mount is skipped.
 *
 * Returns 0 on success or the negative errno-style value of the first
 * mount_device() call that failed. */
static int mount_devices(const char *where, const char *root_device, const char *home_device, const char *srv_device) {
        int r;

        assert(where);

        /* Mount relative to the "where" argument rather than reaching for
         * the global arg_directory, which made the parameter meaningless. */

        if (root_device) {
                r = mount_device(root_device, where, NULL);
                if (r < 0) {
                        log_error("Failed to mount root directory: %s", strerror(-r));
                        return r;
                }
        }

        if (home_device) {
                r = mount_device(home_device, where, "/home");
                if (r < 0) {
                        log_error("Failed to mount home directory: %s", strerror(-r));
                        return r;
                }
        }

        if (srv_device) {
                r = mount_device(srv_device, where, "/srv");
                if (r < 0) {
                        log_error("Failed to mount server data directory: %s", strerror(-r));
                        return r;
                }
        }

        return 0;
}
2213
2214 static void loop_remove(int nr, int *image_fd) {
2215         _cleanup_close_ int control = -1;
2216
2217         if (nr < 0)
2218                 return;
2219
2220         if (image_fd && *image_fd >= 0) {
2221                 ioctl(*image_fd, LOOP_CLR_FD);
2222                 close_nointr_nofail(*image_fd);
2223                 *image_fd = -1;
2224         }
2225
2226         control = open("/dev/loop-control", O_RDWR|O_CLOEXEC|O_NOCTTY|O_NONBLOCK);
2227         if (control < 0)
2228                 return;
2229
2230         ioctl(control, LOOP_CTL_REMOVE, nr);
2231 }
2232
2233 static int spawn_getent(const char *database, const char *key, pid_t *rpid) {
2234         int pipe_fds[2];
2235         pid_t pid;
2236
2237         assert(database);
2238         assert(key);
2239         assert(rpid);
2240
2241         if (pipe2(pipe_fds, O_CLOEXEC) < 0) {
2242                 log_error("Failed to allocate pipe: %m");
2243                 return -errno;
2244         }
2245
2246         pid = fork();
2247         if (pid < 0) {
2248                 log_error("Failed to fork getent child: %m");
2249                 return -errno;
2250         } else if (pid == 0) {
2251                 int nullfd;
2252                 char *empty_env = NULL;
2253
2254                 if (dup3(pipe_fds[1], STDOUT_FILENO, 0) < 0)
2255                         _exit(EXIT_FAILURE);
2256
2257                 if (pipe_fds[0] > 2)
2258                         close_nointr_nofail(pipe_fds[0]);
2259                 if (pipe_fds[1] > 2)
2260                         close_nointr_nofail(pipe_fds[1]);
2261
2262                 nullfd = open("/dev/null", O_RDWR);
2263                 if (nullfd < 0)
2264                         _exit(EXIT_FAILURE);
2265
2266                 if (dup3(nullfd, STDIN_FILENO, 0) < 0)
2267                         _exit(EXIT_FAILURE);
2268
2269                 if (dup3(nullfd, STDERR_FILENO, 0) < 0)
2270                         _exit(EXIT_FAILURE);
2271
2272                 if (nullfd > 2)
2273                         close_nointr_nofail(nullfd);
2274
2275                 reset_all_signal_handlers();
2276                 close_all_fds(NULL, 0);
2277
2278                 execle("/usr/bin/getent", "getenv", database, key, NULL, &empty_env);
2279                 execle("/bin/getent", "getenv", database, key, NULL, &empty_env);
2280                 _exit(EXIT_FAILURE);
2281         }
2282
2283         close_nointr_nofail(pipe_fds[1]);
2284         pipe_fds[1] = -1;
2285
2286         *rpid = pid;
2287
2288         return pipe_fds[0];
2289 }
2290
2291 static int change_uid_gid(char **_home) {
2292
2293         _cleanup_strv_free_ char **passwd = NULL;
2294         char line[LINE_MAX], *w, *x, *state, *u, *g, *h;
2295         _cleanup_free_ uid_t *uids = NULL;
2296         _cleanup_free_ char *home = NULL;
2297         _cleanup_fclose_ FILE *f = NULL;
2298         _cleanup_close_ int fd = -1;
2299         unsigned n_uids = 0;
2300         size_t sz, l;
2301         uid_t uid;
2302         gid_t gid;
2303         pid_t pid;
2304         int r;
2305
2306         assert(_home);
2307
2308         if (!arg_user || streq(arg_user, "root") || streq(arg_user, "0")) {
2309                 /* Reset everything fully to 0, just in case */
2310
2311                 if (setgroups(0, NULL) < 0) {
2312                         log_error("setgroups() failed: %m");
2313                         return -errno;
2314                 }
2315
2316                 if (setresgid(0, 0, 0) < 0) {
2317                         log_error("setregid() failed: %m");
2318                         return -errno;
2319                 }
2320
2321                 if (setresuid(0, 0, 0) < 0) {
2322                         log_error("setreuid() failed: %m");
2323                         return -errno;
2324                 }
2325
2326                 *_home = NULL;
2327                 return 0;
2328         }
2329
2330         /* First, get user credentials */
2331         fd = spawn_getent("passwd", arg_user, &pid);
2332         if (fd < 0)
2333                 return fd;
2334
2335         f = fdopen(fd, "r");
2336         if (!f)
2337                 return log_oom();
2338         fd = -1;
2339
2340         if (!fgets(line, sizeof(line), f)) {
2341
2342                 if (!ferror(f)) {
2343                         log_error("Failed to resolve user %s.", arg_user);
2344                         return -ESRCH;
2345                 }
2346
2347                 log_error("Failed to read from getent: %m");
2348                 return -errno;
2349         }
2350
2351         truncate_nl(line);
2352
2353         wait_for_terminate_and_warn("getent passwd", pid);
2354
2355         x = strchr(line, ':');
2356         if (!x) {
2357                 log_error("/etc/passwd entry has invalid user field.");
2358                 return -EIO;
2359         }
2360
2361         u = strchr(x+1, ':');
2362         if (!u) {
2363                 log_error("/etc/passwd entry has invalid password field.");
2364                 return -EIO;
2365         }
2366
2367         u++;
2368         g = strchr(u, ':');
2369         if (!g) {
2370                 log_error("/etc/passwd entry has invalid UID field.");
2371                 return -EIO;
2372         }
2373
2374         *g = 0;
2375         g++;
2376         x = strchr(g, ':');
2377         if (!x) {
2378                 log_error("/etc/passwd entry has invalid GID field.");
2379                 return -EIO;
2380         }
2381
2382         *x = 0;
2383         h = strchr(x+1, ':');
2384         if (!h) {
2385                 log_error("/etc/passwd entry has invalid GECOS field.");
2386                 return -EIO;
2387         }
2388
2389         h++;
2390         x = strchr(h, ':');
2391         if (!x) {
2392                 log_error("/etc/passwd entry has invalid home directory field.");
2393                 return -EIO;
2394         }
2395
2396         *x = 0;
2397
2398         r = parse_uid(u, &uid);
2399         if (r < 0) {
2400                 log_error("Failed to parse UID of user.");
2401                 return -EIO;
2402         }
2403
2404         r = parse_gid(g, &gid);
2405         if (r < 0) {
2406                 log_error("Failed to parse GID of user.");
2407                 return -EIO;
2408         }
2409
2410         home = strdup(h);
2411         if (!home)
2412                 return log_oom();
2413
2414         /* Second, get group memberships */
2415         fd = spawn_getent("initgroups", arg_user, &pid);
2416         if (fd < 0)
2417                 return fd;
2418
2419         fclose(f);
2420         f = fdopen(fd, "r");
2421         if (!f)
2422                 return log_oom();
2423         fd = -1;
2424
2425         if (!fgets(line, sizeof(line), f)) {
2426                 if (!ferror(f)) {
2427                         log_error("Failed to resolve user %s.", arg_user);
2428                         return -ESRCH;
2429                 }
2430
2431                 log_error("Failed to read from getent: %m");
2432                 return -errno;
2433         }
2434
2435         truncate_nl(line);
2436
2437         wait_for_terminate_and_warn("getent initgroups", pid);
2438
2439         /* Skip over the username and subsequent separator whitespace */
2440         x = line;
2441         x += strcspn(x, WHITESPACE);
2442         x += strspn(x, WHITESPACE);
2443
2444         FOREACH_WORD(w, l, x, state) {
2445                 char c[l+1];
2446
2447                 memcpy(c, w, l);
2448                 c[l] = 0;
2449
2450                 if (!GREEDY_REALLOC(uids, sz, n_uids+1))
2451                         return log_oom();
2452
2453                 r = parse_uid(c, &uids[n_uids++]);
2454                 if (r < 0) {
2455                         log_error("Failed to parse group data from getent.");
2456                         return -EIO;
2457                 }
2458         }
2459
2460         r = mkdir_parents(home, 0775);
2461         if (r < 0) {
2462                 log_error("Failed to make home root directory: %s", strerror(-r));
2463                 return r;
2464         }
2465
2466         r = mkdir_safe(home, 0755, uid, gid);
2467         if (r < 0) {
2468                 log_error("Failed to make home directory: %s", strerror(-r));
2469                 return r;
2470         }
2471
2472         fchown(STDIN_FILENO, uid, gid);
2473         fchown(STDOUT_FILENO, uid, gid);
2474         fchown(STDERR_FILENO, uid, gid);
2475
2476         if (setgroups(n_uids, uids) < 0) {
2477                 log_error("Failed to set auxiliary groups: %m");
2478                 return -errno;
2479         }
2480
2481         if (setresgid(gid, gid, gid) < 0) {
2482                 log_error("setregid() failed: %m");
2483                 return -errno;
2484         }
2485
2486         if (setresuid(uid, uid, uid) < 0) {
2487                 log_error("setreuid() failed: %m");
2488                 return -errno;
2489         }
2490
2491         if (_home) {
2492                 *_home = home;
2493                 home = NULL;
2494         }
2495
2496         return 0;
2497 }
2498
2499 int main(int argc, char *argv[]) {
2500
2501         _cleanup_free_ char *kdbus_domain = NULL, *device_path = NULL, *root_device = NULL, *home_device = NULL, *srv_device = NULL;
2502         _cleanup_close_ int master = -1, kdbus_fd = -1, image_fd = -1;
2503         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
2504         _cleanup_fdset_free_ FDSet *fds = NULL;
2505         int r = EXIT_FAILURE, k, n_fd_passed, loop_nr = -1;
2506         const char *console = NULL;
2507         char veth_name[IFNAMSIZ];
2508         bool secondary = false;
2509         pid_t pid = 0;
2510         sigset_t mask;
2511
2512         log_parse_environment();
2513         log_open();
2514
2515         k = parse_argv(argc, argv);
2516         if (k < 0)
2517                 goto finish;
2518         else if (k == 0) {
2519                 r = EXIT_SUCCESS;
2520                 goto finish;
2521         }
2522
2523         if (!arg_image) {
2524                 if (arg_directory) {
2525                         char *p;
2526
2527                         p = path_make_absolute_cwd(arg_directory);
2528                         free(arg_directory);
2529                         arg_directory = p;
2530                 } else
2531                         arg_directory = get_current_dir_name();
2532
2533                 if (!arg_directory) {
2534                         log_error("Failed to determine path, please use -D.");
2535                         goto finish;
2536                 }
2537                 path_kill_slashes(arg_directory);
2538         }
2539
2540         if (!arg_machine) {
2541                 arg_machine = strdup(basename(arg_image ? arg_image : arg_directory));
2542                 if (!arg_machine) {
2543                         log_oom();
2544                         goto finish;
2545                 }
2546
2547                 hostname_cleanup(arg_machine, false);
2548                 if (isempty(arg_machine)) {
2549                         log_error("Failed to determine machine name automatically, please use -M.");
2550                         goto finish;
2551                 }
2552         }
2553
2554         if (geteuid() != 0) {
2555                 log_error("Need to be root.");
2556                 goto finish;
2557         }
2558
2559         if (sd_booted() <= 0) {
2560                 log_error("Not running on a systemd system.");
2561                 goto finish;
2562         }
2563
2564         log_close();
2565         n_fd_passed = sd_listen_fds(false);
2566         if (n_fd_passed > 0) {
2567                 k = fdset_new_listen_fds(&fds, false);
2568                 if (k < 0) {
2569                         log_error("Failed to collect file descriptors: %s", strerror(-k));
2570                         goto finish;
2571                 }
2572         }
2573         fdset_close_others(fds);
2574         log_open();
2575
2576         if (arg_directory) {
2577                 if (path_equal(arg_directory, "/")) {
2578                         log_error("Spawning container on root directory not supported.");
2579                         goto finish;
2580                 }
2581
2582                 if (arg_boot) {
2583                         if (path_is_os_tree(arg_directory) <= 0) {
2584                                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
2585                                 goto finish;
2586                         }
2587                 } else {
2588                         const char *p;
2589
2590                         p = strappenda(arg_directory,
2591                                        argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
2592                         if (access(p, F_OK) < 0) {
2593                                 log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
2594                                 goto finish;
2595
2596                         }
2597                 }
2598         } else {
2599                 char template[] = "/tmp/nspawn-root-XXXXXX";
2600
2601                 if (!mkdtemp(template)) {
2602                         log_error("Failed to create temporary directory: %m");
2603                         r = -errno;
2604                         goto finish;
2605                 }
2606
2607                 arg_directory = strdup(template);
2608                 if (!arg_directory) {
2609                         r = log_oom();
2610                         goto finish;
2611                 }
2612
2613                 image_fd = setup_image(&device_path, &loop_nr);
2614                 if (image_fd < 0) {
2615                         r = image_fd;
2616                         goto finish;
2617                 }
2618
2619                 r = dissect_image(image_fd, &root_device, &home_device, &srv_device, &secondary);
2620                 if (r < 0)
2621                         goto finish;
2622         }
2623
2624         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
2625         if (master < 0) {
2626                 log_error("Failed to acquire pseudo tty: %m");
2627                 goto finish;
2628         }
2629
2630         console = ptsname(master);
2631         if (!console) {
2632                 log_error("Failed to determine tty name: %m");
2633                 goto finish;
2634         }
2635
2636         if (!arg_quiet)
2637                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_image ? arg_image : arg_directory);
2638
2639         if (unlockpt(master) < 0) {
2640                 log_error("Failed to unlock tty: %m");
2641                 goto finish;
2642         }
2643
2644         if (access("/dev/kdbus/control", F_OK) >= 0) {
2645
2646                 if (arg_share_system) {
2647                         kdbus_domain = strdup("/dev/kdbus");
2648                         if (!kdbus_domain) {
2649                                 log_oom();
2650                                 goto finish;
2651                         }
2652                 } else {
2653                         const char *ns;
2654
2655                         ns = strappenda("machine-", arg_machine);
2656                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
2657                         if (r < 0)
2658                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
2659                         else
2660                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
2661                 }
2662         }
2663
2664         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
2665                 log_error("Failed to create kmsg socket pair: %m");
2666                 goto finish;
2667         }
2668
2669         sd_notify(0, "READY=1");
2670
2671         assert_se(sigemptyset(&mask) == 0);
2672         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
2673         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
2674
2675         for (;;) {
2676                 int parent_ready_fd = -1, child_ready_fd = -1;
2677                 siginfo_t status;
2678                 eventfd_t x;
2679
2680                 parent_ready_fd = eventfd(0, EFD_CLOEXEC);
2681                 if (parent_ready_fd < 0) {
2682                         log_error("Failed to create event fd: %m");
2683                         goto finish;
2684                 }
2685
2686                 child_ready_fd = eventfd(0, EFD_CLOEXEC);
2687                 if (child_ready_fd < 0) {
2688                         log_error("Failed to create event fd: %m");
2689                         goto finish;
2690                 }
2691
2692                 pid = syscall(__NR_clone,
2693                               SIGCHLD|CLONE_NEWNS|
2694                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
2695                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
2696                 if (pid < 0) {
2697                         if (errno == EINVAL)
2698                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
2699                         else
2700                                 log_error("clone() failed: %m");
2701
2702                         goto finish;
2703                 }
2704
2705                 if (pid == 0) {
2706                         /* child */
2707                         _cleanup_free_ char *home = NULL;
2708                         unsigned n_env = 2;
2709                         const char *envp[] = {
2710                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
2711                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
2712                                 NULL, /* TERM */
2713                                 NULL, /* HOME */
2714                                 NULL, /* USER */
2715                                 NULL, /* LOGNAME */
2716                                 NULL, /* container_uuid */
2717                                 NULL, /* LISTEN_FDS */
2718                                 NULL, /* LISTEN_PID */
2719                                 NULL
2720                         };
2721                         char **env_use;
2722
2723                         envp[n_env] = strv_find_prefix(environ, "TERM=");
2724                         if (envp[n_env])
2725                                 n_env ++;
2726
2727                         close_nointr_nofail(master);
2728                         master = -1;
2729
2730                         close_nointr(STDIN_FILENO);
2731                         close_nointr(STDOUT_FILENO);
2732                         close_nointr(STDERR_FILENO);
2733
2734                         close_nointr_nofail(kmsg_socket_pair[0]);
2735                         kmsg_socket_pair[0] = -1;
2736
2737                         reset_all_signal_handlers();
2738
2739                         assert_se(sigemptyset(&mask) == 0);
2740                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
2741
2742                         k = open_terminal(console, O_RDWR);
2743                         if (k != STDIN_FILENO) {
2744                                 if (k >= 0) {
2745                                         close_nointr_nofail(k);
2746                                         k = -EINVAL;
2747                                 }
2748
2749                                 log_error("Failed to open console: %s", strerror(-k));
2750                                 goto child_fail;
2751                         }
2752
2753                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2754                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2755                                 log_error("Failed to duplicate console: %m");
2756                                 goto child_fail;
2757                         }
2758
2759                         if (setsid() < 0) {
2760                                 log_error("setsid() failed: %m");
2761                                 goto child_fail;
2762                         }
2763
2764                         if (reset_audit_loginuid() < 0)
2765                                 goto child_fail;
2766
2767                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2768                                 log_error("PR_SET_PDEATHSIG failed: %m");
2769                                 goto child_fail;
2770                         }
2771
2772                         /* Mark everything as slave, so that we still
2773                          * receive mounts from the real root, but don't
2774                          * propagate mounts to the real root. */
2775                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2776                                 log_error("MS_SLAVE|MS_REC failed: %m");
2777                                 goto child_fail;
2778                         }
2779
2780                         if (mount_devices(arg_directory, root_device, home_device, srv_device) < 0)
2781                                 goto child_fail;
2782
2783                         /* Turn directory into bind mount */
2784                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2785                                 log_error("Failed to make bind mount.");
2786                                 goto child_fail;
2787                         }
2788
2789                         if (arg_read_only)
2790                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2791                                         log_error("Failed to make read-only.");
2792                                         goto child_fail;
2793                                 }
2794
2795                         if (mount_all(arg_directory) < 0)
2796                                 goto child_fail;
2797
2798                         if (copy_devnodes(arg_directory) < 0)
2799                                 goto child_fail;
2800
2801                         if (setup_ptmx(arg_directory) < 0)
2802                                 goto child_fail;
2803
2804                         dev_setup(arg_directory);
2805
2806                         if (audit_still_doesnt_work_in_containers() < 0)
2807                                 goto child_fail;
2808
2809                         if (setup_dev_console(arg_directory, console) < 0)
2810                                 goto child_fail;
2811
2812                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2813                                 goto child_fail;
2814
2815                         close_nointr_nofail(kmsg_socket_pair[1]);
2816                         kmsg_socket_pair[1] = -1;
2817
2818                         if (setup_boot_id(arg_directory) < 0)
2819                                 goto child_fail;
2820
2821                         if (setup_timezone(arg_directory) < 0)
2822                                 goto child_fail;
2823
2824                         if (setup_resolv_conf(arg_directory) < 0)
2825                                 goto child_fail;
2826
2827                         if (setup_journal(arg_directory) < 0)
2828                                 goto child_fail;
2829
2830                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
2831                                 goto child_fail;
2832
2833                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2834                                 goto child_fail;
2835
2836                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
2837                                 goto child_fail;
2838
2839                         /* Tell the parent that we are ready, and that
2840                          * it can cgroupify us to that we lack access
2841                          * to certain devices and resources. */
2842                         eventfd_write(child_ready_fd, 1);
2843                         close_nointr_nofail(child_ready_fd);
2844                         child_ready_fd = -1;
2845
2846                         if (chdir(arg_directory) < 0) {
2847                                 log_error("chdir(%s) failed: %m", arg_directory);
2848                                 goto child_fail;
2849                         }
2850
2851                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2852                                 log_error("mount(MS_MOVE) failed: %m");
2853                                 goto child_fail;
2854                         }
2855
2856                         if (chroot(".") < 0) {
2857                                 log_error("chroot() failed: %m");
2858                                 goto child_fail;
2859                         }
2860
2861                         if (chdir("/") < 0) {
2862                                 log_error("chdir() failed: %m");
2863                                 goto child_fail;
2864                         }
2865
2866                         umask(0022);
2867
2868                         if (arg_private_network)
2869                                 loopback_setup();
2870
2871                         if (drop_capabilities() < 0) {
2872                                 log_error("drop_capabilities() failed: %m");
2873                                 goto child_fail;
2874                         }
2875
2876                         r = change_uid_gid(&home);
2877                         if (r < 0)
2878                                 goto child_fail;
2879
2880                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2881                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2882                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
2883                                 log_oom();
2884                                 goto child_fail;
2885                         }
2886
2887                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2888                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
2889                                         log_oom();
2890                                         goto child_fail;
2891                                 }
2892                         }
2893
2894                         if (fdset_size(fds) > 0) {
2895                                 k = fdset_cloexec(fds, false);
2896                                 if (k < 0) {
2897                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
2898                                         goto child_fail;
2899                                 }
2900
2901                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
2902                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
2903                                         log_oom();
2904                                         goto child_fail;
2905                                 }
2906                         }
2907
2908                         setup_hostname();
2909
2910                         if (arg_personality != 0xffffffffLU) {
2911                                 if (personality(arg_personality) < 0) {
2912                                         log_error("personality() failed: %m");
2913                                         goto child_fail;
2914                                 }
2915                         } else if (secondary) {
2916                                 if (personality(PER_LINUX32) < 0) {
2917                                         log_error("personality() failed: %m");
2918                                         goto child_fail;
2919                                 }
2920                         }
2921
2922 #ifdef HAVE_SELINUX
2923                         if (arg_selinux_context)
2924                                 if (setexeccon((security_context_t) arg_selinux_context) < 0) {
2925                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
2926                                         goto child_fail;
2927                                 }
2928 #endif
2929
2930                         if (!strv_isempty(arg_setenv)) {
2931                                 char **n;
2932
2933                                 n = strv_env_merge(2, envp, arg_setenv);
2934                                 if (!n) {
2935                                         log_oom();
2936                                         goto child_fail;
2937                                 }
2938
2939                                 env_use = n;
2940                         } else
2941                                 env_use = (char**) envp;
2942
2943                         /* Wait until the parent is ready with the setup, too... */
2944                         eventfd_read(parent_ready_fd, &x);
2945                         close_nointr_nofail(parent_ready_fd);
2946                         parent_ready_fd = -1;
2947
2948                         if (arg_boot) {
2949                                 char **a;
2950                                 size_t l;
2951
2952                                 /* Automatically search for the init system */
2953
2954                                 l = 1 + argc - optind;
2955                                 a = newa(char*, l + 1);
2956                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
2957
2958                                 a[0] = (char*) "/usr/lib/systemd/systemd";
2959                                 execve(a[0], a, env_use);
2960
2961                                 a[0] = (char*) "/lib/systemd/systemd";
2962                                 execve(a[0], a, env_use);
2963
2964                                 a[0] = (char*) "/sbin/init";
2965                                 execve(a[0], a, env_use);
2966                         } else if (argc > optind)
2967                                 execvpe(argv[optind], argv + optind, env_use);
2968                         else {
2969                                 chdir(home ? home : "/root");
2970                                 execle("/bin/bash", "-bash", NULL, env_use);
2971                                 execle("/bin/sh", "-sh", NULL, env_use);
2972                         }
2973
2974                         log_error("execv() failed: %m");
2975
2976                 child_fail:
2977                         _exit(EXIT_FAILURE);
2978                 }
2979
2980                 fdset_free(fds);
2981                 fds = NULL;
2982
2983                 /* Wait until the child reported that it is ready with
2984                  * all it needs to do with priviliges. After we got
2985                  * the notification we can make the process join its
2986                  * cgroup which might limit what it can do */
2987                 eventfd_read(child_ready_fd, &x);
2988
2989                 r = register_machine(pid);
2990                 if (r < 0)
2991                         goto finish;
2992
2993                 r = move_network_interfaces(pid);
2994                 if (r < 0)
2995                         goto finish;
2996
2997                 r = setup_veth(pid, veth_name);
2998                 if (r < 0)
2999                         goto finish;
3000
3001                 r = setup_bridge(veth_name);
3002                 if (r < 0)
3003                         goto finish;
3004
3005                 r = setup_macvlan(pid);
3006                 if (r < 0)
3007                         goto finish;
3008
3009                 /* Notify the child that the parent is ready with all
3010                  * its setup, and thtat the child can now hand over
3011                  * control to the code to run inside the container. */
3012                 eventfd_write(parent_ready_fd, 1);
3013
3014                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
3015                 if (k < 0) {
3016                         r = EXIT_FAILURE;
3017                         break;
3018                 }
3019
3020                 if (!arg_quiet)
3021                         putc('\n', stdout);
3022
3023                 /* Kill if it is not dead yet anyway */
3024                 terminate_machine(pid);
3025
3026                 /* Redundant, but better safe than sorry */
3027                 kill(pid, SIGKILL);
3028
3029                 k = wait_for_terminate(pid, &status);
3030                 pid = 0;
3031
3032                 if (k < 0) {
3033                         r = EXIT_FAILURE;
3034                         break;
3035                 }
3036
3037                 if (status.si_code == CLD_EXITED) {
3038                         r = status.si_status;
3039                         if (status.si_status != 0) {
3040                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
3041                                 break;
3042                         }
3043
3044                         if (!arg_quiet)
3045                                 log_debug("Container %s exited successfully.", arg_machine);
3046                         break;
3047                 } else if (status.si_code == CLD_KILLED &&
3048                            status.si_status == SIGINT) {
3049
3050                         if (!arg_quiet)
3051                                 log_info("Container %s has been shut down.", arg_machine);
3052                         r = 0;
3053                         break;
3054                 } else if (status.si_code == CLD_KILLED &&
3055                            status.si_status == SIGHUP) {
3056
3057                         if (!arg_quiet)
3058                                 log_info("Container %s is being rebooted.", arg_machine);
3059                         continue;
3060                 } else if (status.si_code == CLD_KILLED ||
3061                            status.si_code == CLD_DUMPED) {
3062
3063                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
3064                         r = EXIT_FAILURE;
3065                         break;
3066                 } else {
3067                         log_error("Container %s failed due to unknown reason.", arg_machine);
3068                         r = EXIT_FAILURE;
3069                         break;
3070                 }
3071         }
3072
3073 finish:
3074         loop_remove(loop_nr, &image_fd);
3075
3076         if (pid > 0)
3077                 kill(pid, SIGKILL);
3078
3079         free(arg_directory);
3080         free(arg_machine);
3081         free(arg_user);
3082         strv_free(arg_setenv);
3083         strv_free(arg_network_interfaces);
3084         strv_free(arg_network_macvlan);
3085         strv_free(arg_bind);
3086         strv_free(arg_bind_ro);
3087
3088         return r;
3089 }