chiark / gitweb /
nspawn: fix detection of missing /proc/self/loginuid
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #include <net/if.h>
45 #include <linux/veth.h>
46 #include <sys/personality.h>
47
48 #ifdef HAVE_SELINUX
49 #include <selinux/selinux.h>
50 #endif
51
52 #ifdef HAVE_SECCOMP
53 #include <seccomp.h>
54 #endif
55
56 #include "sd-daemon.h"
57 #include "sd-bus.h"
58 #include "sd-id128.h"
59 #include "sd-rtnl.h"
60 #include "log.h"
61 #include "util.h"
62 #include "mkdir.h"
63 #include "macro.h"
64 #include "audit.h"
65 #include "missing.h"
66 #include "cgroup-util.h"
67 #include "strv.h"
68 #include "path-util.h"
69 #include "loopback-setup.h"
70 #include "dev-setup.h"
71 #include "fdset.h"
72 #include "build.h"
73 #include "fileio.h"
74 #include "bus-util.h"
75 #include "bus-error.h"
76 #include "ptyfwd.h"
77 #include "bus-kernel.h"
78 #include "env-util.h"
79 #include "def.h"
80 #include "rtnl-util.h"
81 #include "udev-util.h"
82
83 #ifdef HAVE_SECCOMP
84 #include "seccomp-util.h"
85 #endif
86
/* How the container's journal directory is linked with the host's.
 * Selected with --link-journal= (or -j) on the command line. */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* --link-journal=no */
        LINK_AUTO  = 1,   /* --link-journal=auto, the default */
        LINK_HOST  = 2,   /* --link-journal=host */
        LINK_GUEST = 3    /* --link-journal=guest, also selected by -j */
} LinkJournal;
93
94 static char *arg_directory = NULL;
95 static char *arg_user = NULL;
96 static sd_id128_t arg_uuid = {};
97 static char *arg_machine = NULL;
98 static const char *arg_selinux_context = NULL;
99 static const char *arg_selinux_apifs_context = NULL;
100 static const char *arg_slice = NULL;
101 static bool arg_private_network = false;
102 static bool arg_read_only = false;
103 static bool arg_boot = false;
104 static LinkJournal arg_link_journal = LINK_AUTO;
105 static uint64_t arg_retain =
106         (1ULL << CAP_CHOWN) |
107         (1ULL << CAP_DAC_OVERRIDE) |
108         (1ULL << CAP_DAC_READ_SEARCH) |
109         (1ULL << CAP_FOWNER) |
110         (1ULL << CAP_FSETID) |
111         (1ULL << CAP_IPC_OWNER) |
112         (1ULL << CAP_KILL) |
113         (1ULL << CAP_LEASE) |
114         (1ULL << CAP_LINUX_IMMUTABLE) |
115         (1ULL << CAP_NET_BIND_SERVICE) |
116         (1ULL << CAP_NET_BROADCAST) |
117         (1ULL << CAP_NET_RAW) |
118         (1ULL << CAP_SETGID) |
119         (1ULL << CAP_SETFCAP) |
120         (1ULL << CAP_SETPCAP) |
121         (1ULL << CAP_SETUID) |
122         (1ULL << CAP_SYS_ADMIN) |
123         (1ULL << CAP_SYS_CHROOT) |
124         (1ULL << CAP_SYS_NICE) |
125         (1ULL << CAP_SYS_PTRACE) |
126         (1ULL << CAP_SYS_TTY_CONFIG) |
127         (1ULL << CAP_SYS_RESOURCE) |
128         (1ULL << CAP_SYS_BOOT) |
129         (1ULL << CAP_AUDIT_WRITE) |
130         (1ULL << CAP_AUDIT_CONTROL) |
131         (1ULL << CAP_MKNOD);
132 static char **arg_bind = NULL;
133 static char **arg_bind_ro = NULL;
134 static char **arg_setenv = NULL;
135 static bool arg_quiet = false;
136 static bool arg_share_system = false;
137 static bool arg_register = true;
138 static bool arg_keep_unit = false;
139 static char **arg_network_interfaces = NULL;
140 static char **arg_network_macvlan = NULL;
141 static bool arg_network_veth = false;
142 static const char *arg_network_bridge = NULL;
143 static unsigned long arg_personality = 0xffffffffLU;
144
/* Prints the usage summary to stdout, using the short program name as
 * the command name. Always returns 0. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -q --quiet                Do not show status information\n"
               "  -D --directory=NAME       Root directory for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "     --private-network      Disable network in container\n"
               "     --network-interface=INTERFACE\n"
               "                            Assign an existing network interface to the\n"
               "                            container\n"
               "     --network-macvlan=INTERFACE\n"
               "                            Create a macvlan network interface based on an\n"
               "                            existing network interface to the container\n"
               "     --network-veth         Add a virtual ethernet connection between host\n"
               "                            and container\n"
               "     --network-bridge=INTERFACE\n"
               "                            Add a virtual ethernet connection between host\n"
               "                            and container and add it to an existing bridge on\n"
               "                            the host\n"
               /* --personality= is accepted by the option parser, so list it */
               "     --personality=ARCH     Pick personality for this container\n"
               "  -Z --selinux-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            processes in the container\n"
               "  -L --selinux-apifs-context=SECLABEL\n"
               "                            Set the SELinux security context to be used by\n"
               "                            API/tmpfs file systems in the container\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               /* The option parser maps -j to the "guest" mode, not "host" */
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n"
               "     --share-system         Share system namespaces with host\n"
               "     --register=BOOLEAN     Register container as machine\n"
               "     --keep-unit            Do not register a scope for the machine, reuse\n"
               "                            the service unit nspawn is running in\n",
               program_invocation_short_name);

        return 0;
}
195
196 static int parse_argv(int argc, char *argv[]) {
197
198         enum {
199                 ARG_VERSION = 0x100,
200                 ARG_PRIVATE_NETWORK,
201                 ARG_UUID,
202                 ARG_READ_ONLY,
203                 ARG_CAPABILITY,
204                 ARG_DROP_CAPABILITY,
205                 ARG_LINK_JOURNAL,
206                 ARG_BIND,
207                 ARG_BIND_RO,
208                 ARG_SETENV,
209                 ARG_SHARE_SYSTEM,
210                 ARG_REGISTER,
211                 ARG_KEEP_UNIT,
212                 ARG_NETWORK_INTERFACE,
213                 ARG_NETWORK_MACVLAN,
214                 ARG_NETWORK_VETH,
215                 ARG_NETWORK_BRIDGE,
216                 ARG_PERSONALITY,
217         };
218
219         static const struct option options[] = {
220                 { "help",                  no_argument,       NULL, 'h'                   },
221                 { "version",               no_argument,       NULL, ARG_VERSION           },
222                 { "directory",             required_argument, NULL, 'D'                   },
223                 { "user",                  required_argument, NULL, 'u'                   },
224                 { "private-network",       no_argument,       NULL, ARG_PRIVATE_NETWORK   },
225                 { "boot",                  no_argument,       NULL, 'b'                   },
226                 { "uuid",                  required_argument, NULL, ARG_UUID              },
227                 { "read-only",             no_argument,       NULL, ARG_READ_ONLY         },
228                 { "capability",            required_argument, NULL, ARG_CAPABILITY        },
229                 { "drop-capability",       required_argument, NULL, ARG_DROP_CAPABILITY   },
230                 { "link-journal",          required_argument, NULL, ARG_LINK_JOURNAL      },
231                 { "bind",                  required_argument, NULL, ARG_BIND              },
232                 { "bind-ro",               required_argument, NULL, ARG_BIND_RO           },
233                 { "machine",               required_argument, NULL, 'M'                   },
234                 { "slice",                 required_argument, NULL, 'S'                   },
235                 { "setenv",                required_argument, NULL, ARG_SETENV            },
236                 { "selinux-context",       required_argument, NULL, 'Z'                   },
237                 { "selinux-apifs-context", required_argument, NULL, 'L'                   },
238                 { "quiet",                 no_argument,       NULL, 'q'                   },
239                 { "share-system",          no_argument,       NULL, ARG_SHARE_SYSTEM      },
240                 { "register",              required_argument, NULL, ARG_REGISTER          },
241                 { "keep-unit",             no_argument,       NULL, ARG_KEEP_UNIT         },
242                 { "network-interface",     required_argument, NULL, ARG_NETWORK_INTERFACE },
243                 { "network-macvlan",       required_argument, NULL, ARG_NETWORK_MACVLAN   },
244                 { "network-veth",          no_argument,       NULL, ARG_NETWORK_VETH      },
245                 { "network-bridge",        required_argument, NULL, ARG_NETWORK_BRIDGE    },
246                 { "personality",           required_argument, NULL, ARG_PERSONALITY       },
247                 {}
248         };
249
250         int c, r;
251         uint64_t plus = 0, minus = 0;
252
253         assert(argc >= 0);
254         assert(argv);
255
256         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:q", options, NULL)) >= 0) {
257
258                 switch (c) {
259
260                 case 'h':
261                         return help();
262
263                 case ARG_VERSION:
264                         puts(PACKAGE_STRING);
265                         puts(SYSTEMD_FEATURES);
266                         return 0;
267
268                 case 'D':
269                         free(arg_directory);
270                         arg_directory = canonicalize_file_name(optarg);
271                         if (!arg_directory) {
272                                 log_error("Invalid root directory: %m");
273                                 return -ENOMEM;
274                         }
275
276                         break;
277
278                 case 'u':
279                         free(arg_user);
280                         arg_user = strdup(optarg);
281                         if (!arg_user)
282                                 return log_oom();
283
284                         break;
285
286                 case ARG_NETWORK_BRIDGE:
287                         arg_network_bridge = optarg;
288
289                         /* fall through */
290
291                 case ARG_NETWORK_VETH:
292                         arg_network_veth = true;
293                         arg_private_network = true;
294                         break;
295
296                 case ARG_NETWORK_INTERFACE:
297                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
298                                 return log_oom();
299
300                         arg_private_network = true;
301                         break;
302
303                 case ARG_NETWORK_MACVLAN:
304                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
305                                 return log_oom();
306
307                         /* fall through */
308
309                 case ARG_PRIVATE_NETWORK:
310                         arg_private_network = true;
311                         break;
312
313                 case 'b':
314                         arg_boot = true;
315                         break;
316
317                 case ARG_UUID:
318                         r = sd_id128_from_string(optarg, &arg_uuid);
319                         if (r < 0) {
320                                 log_error("Invalid UUID: %s", optarg);
321                                 return r;
322                         }
323                         break;
324
325                 case 'S':
326                         arg_slice = optarg;
327                         break;
328
329                 case 'M':
330                         if (isempty(optarg)) {
331                                 free(arg_machine);
332                                 arg_machine = NULL;
333                         } else {
334
335                                 if (!hostname_is_valid(optarg)) {
336                                         log_error("Invalid machine name: %s", optarg);
337                                         return -EINVAL;
338                                 }
339
340                                 free(arg_machine);
341                                 arg_machine = strdup(optarg);
342                                 if (!arg_machine)
343                                         return log_oom();
344
345                                 break;
346                         }
347
348                 case 'Z':
349                         arg_selinux_context = optarg;
350                         break;
351
352                 case 'L':
353                         arg_selinux_apifs_context = optarg;
354                         break;
355
356                 case ARG_READ_ONLY:
357                         arg_read_only = true;
358                         break;
359
360                 case ARG_CAPABILITY:
361                 case ARG_DROP_CAPABILITY: {
362                         char *state, *word;
363                         size_t length;
364
365                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
366                                 _cleanup_free_ char *t;
367                                 cap_value_t cap;
368
369                                 t = strndup(word, length);
370                                 if (!t)
371                                         return log_oom();
372
373                                 if (streq(t, "all")) {
374                                         if (c == ARG_CAPABILITY)
375                                                 plus = (uint64_t) -1;
376                                         else
377                                                 minus = (uint64_t) -1;
378                                 } else {
379                                         if (cap_from_name(t, &cap) < 0) {
380                                                 log_error("Failed to parse capability %s.", t);
381                                                 return -EINVAL;
382                                         }
383
384                                         if (c == ARG_CAPABILITY)
385                                                 plus |= 1ULL << (uint64_t) cap;
386                                         else
387                                                 minus |= 1ULL << (uint64_t) cap;
388                                 }
389                         }
390
391                         break;
392                 }
393
394                 case 'j':
395                         arg_link_journal = LINK_GUEST;
396                         break;
397
398                 case ARG_LINK_JOURNAL:
399                         if (streq(optarg, "auto"))
400                                 arg_link_journal = LINK_AUTO;
401                         else if (streq(optarg, "no"))
402                                 arg_link_journal = LINK_NO;
403                         else if (streq(optarg, "guest"))
404                                 arg_link_journal = LINK_GUEST;
405                         else if (streq(optarg, "host"))
406                                 arg_link_journal = LINK_HOST;
407                         else {
408                                 log_error("Failed to parse link journal mode %s", optarg);
409                                 return -EINVAL;
410                         }
411
412                         break;
413
414                 case ARG_BIND:
415                 case ARG_BIND_RO: {
416                         _cleanup_free_ char *a = NULL, *b = NULL;
417                         char *e;
418                         char ***x;
419
420                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
421
422                         e = strchr(optarg, ':');
423                         if (e) {
424                                 a = strndup(optarg, e - optarg);
425                                 b = strdup(e + 1);
426                         } else {
427                                 a = strdup(optarg);
428                                 b = strdup(optarg);
429                         }
430
431                         if (!a || !b)
432                                 return log_oom();
433
434                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
435                                 log_error("Invalid bind mount specification: %s", optarg);
436                                 return -EINVAL;
437                         }
438
439                         r = strv_extend(x, a);
440                         if (r < 0)
441                                 return log_oom();
442
443                         r = strv_extend(x, b);
444                         if (r < 0)
445                                 return log_oom();
446
447                         break;
448                 }
449
450                 case ARG_SETENV: {
451                         char **n;
452
453                         if (!env_assignment_is_valid(optarg)) {
454                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
455                                 return -EINVAL;
456                         }
457
458                         n = strv_env_set(arg_setenv, optarg);
459                         if (!n)
460                                 return log_oom();
461
462                         strv_free(arg_setenv);
463                         arg_setenv = n;
464                         break;
465                 }
466
467                 case 'q':
468                         arg_quiet = true;
469                         break;
470
471                 case ARG_SHARE_SYSTEM:
472                         arg_share_system = true;
473                         break;
474
475                 case ARG_REGISTER:
476                         r = parse_boolean(optarg);
477                         if (r < 0) {
478                                 log_error("Failed to parse --register= argument: %s", optarg);
479                                 return r;
480                         }
481
482                         arg_register = r;
483                         break;
484
485                 case ARG_KEEP_UNIT:
486                         arg_keep_unit = true;
487                         break;
488
489                 case ARG_PERSONALITY:
490
491                         arg_personality = personality_from_string(optarg);
492                         if (arg_personality == 0xffffffffLU) {
493                                 log_error("Unknown or unsupported personality '%s'.", optarg);
494                                 return -EINVAL;
495                         }
496
497                         break;
498
499                 case '?':
500                         return -EINVAL;
501
502                 default:
503                         assert_not_reached("Unhandled option");
504                 }
505         }
506
507         if (arg_share_system)
508                 arg_register = false;
509
510         if (arg_boot && arg_share_system) {
511                 log_error("--boot and --share-system may not be combined.");
512                 return -EINVAL;
513         }
514
515         if (arg_keep_unit && cg_pid_get_owner_uid(0, NULL) >= 0) {
516                 log_error("--keep-unit may not be used when invoked from a user session.");
517                 return -EINVAL;
518         }
519
520         arg_retain = (arg_retain | plus | (arg_private_network ? 1ULL << CAP_NET_ADMIN : 0)) & ~minus;
521
522         return 1;
523 }
524
525 static int mount_all(const char *dest) {
526
527         typedef struct MountPoint {
528                 const char *what;
529                 const char *where;
530                 const char *type;
531                 const char *options;
532                 unsigned long flags;
533                 bool fatal;
534         } MountPoint;
535
536         static const MountPoint mount_table[] = {
537                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
538                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
539                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
540                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
541                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
542                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
543                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
544                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
545 #ifdef HAVE_SELINUX
546                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
547                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
548 #endif
549         };
550
551         unsigned k;
552         int r = 0;
553
554         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
555                 _cleanup_free_ char *where = NULL;
556 #ifdef HAVE_SELINUX
557                 _cleanup_free_ char *options = NULL;
558 #endif
559                 const char *o;
560                 int t;
561
562                 where = strjoin(dest, "/", mount_table[k].where, NULL);
563                 if (!where)
564                         return log_oom();
565
566                 t = path_is_mount_point(where, true);
567                 if (t < 0) {
568                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
569
570                         if (r == 0)
571                                 r = t;
572
573                         continue;
574                 }
575
576                 /* Skip this entry if it is not a remount. */
577                 if (mount_table[k].what && t > 0)
578                         continue;
579
580                 mkdir_p(where, 0755);
581
582 #ifdef HAVE_SELINUX
583                 if (arg_selinux_apifs_context &&
584                     (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
585                         options = strjoin(mount_table[k].options, ",context=\"", arg_selinux_apifs_context, "\"", NULL);
586                         if (!options)
587                                 return log_oom();
588
589                         o = options;
590                 } else
591 #endif
592                         o = mount_table[k].options;
593
594
595                 if (mount(mount_table[k].what,
596                           where,
597                           mount_table[k].type,
598                           mount_table[k].flags,
599                           o) < 0 &&
600                     mount_table[k].fatal) {
601
602                         log_error("mount(%s) failed: %m", where);
603
604                         if (r == 0)
605                                 r = -errno;
606                 }
607         }
608
609         return r;
610 }
611
612 static int mount_binds(const char *dest, char **l, unsigned long flags) {
613         char **x, **y;
614
615         STRV_FOREACH_PAIR(x, y, l) {
616                 char *where;
617                 struct stat source_st, dest_st;
618                 int r;
619
620                 if (stat(*x, &source_st) < 0) {
621                         log_error("failed to stat %s: %m", *x);
622                         return -errno;
623                 }
624
625                 where = strappenda(dest, *y);
626                 r = stat(where, &dest_st);
627                 if (r == 0) {
628                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
629                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
630                                                 *x, where);
631                                 return -EINVAL;
632                         }
633                 } else if (errno == ENOENT) {
634                         r = mkdir_parents_label(where, 0755);
635                         if (r < 0) {
636                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
637                                 return r;
638                         }
639                 } else {
640                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
641                         return -errno;
642                 }
643                 /* Create the mount point, but be conservative -- refuse to create block
644                 * and char devices. */
645                 if (S_ISDIR(source_st.st_mode))
646                         mkdir_label(where, 0755);
647                 else if (S_ISFIFO(source_st.st_mode))
648                         mkfifo(where, 0644);
649                 else if (S_ISSOCK(source_st.st_mode))
650                         mknod(where, 0644 | S_IFSOCK, 0);
651                 else if (S_ISREG(source_st.st_mode))
652                         touch(where);
653                 else {
654                         log_error("Refusing to create mountpoint for file: %s", *x);
655                         return -ENOTSUP;
656                 }
657
658                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
659                         log_error("mount(%s) failed: %m", where);
660                         return -errno;
661                 }
662
663                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
664                         log_error("mount(%s) failed: %m", where);
665                         return -errno;
666                 }
667         }
668
669         return 0;
670 }
671
672 static int setup_timezone(const char *dest) {
673         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
674         char *z, *y;
675         int r;
676
677         assert(dest);
678
679         /* Fix the timezone, if possible */
680         r = readlink_malloc("/etc/localtime", &p);
681         if (r < 0) {
682                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
683                 return 0;
684         }
685
686         z = path_startswith(p, "../usr/share/zoneinfo/");
687         if (!z)
688                 z = path_startswith(p, "/usr/share/zoneinfo/");
689         if (!z) {
690                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
691                 return 0;
692         }
693
694         where = strappend(dest, "/etc/localtime");
695         if (!where)
696                 return log_oom();
697
698         r = readlink_malloc(where, &q);
699         if (r >= 0) {
700                 y = path_startswith(q, "../usr/share/zoneinfo/");
701                 if (!y)
702                         y = path_startswith(q, "/usr/share/zoneinfo/");
703
704
705                 /* Already pointing to the right place? Then do nothing .. */
706                 if (y && streq(y, z))
707                         return 0;
708         }
709
710         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
711         if (!check)
712                 return log_oom();
713
714         if (access(check, F_OK) < 0) {
715                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
716                 return 0;
717         }
718
719         what = strappend("../usr/share/zoneinfo/", z);
720         if (!what)
721                 return log_oom();
722
723         unlink(where);
724         if (symlink(what, where) < 0) {
725                 log_error("Failed to correct timezone of container: %m");
726                 return 0;
727         }
728
729         return 0;
730 }
731
732 static int setup_resolv_conf(const char *dest) {
733         char _cleanup_free_ *where = NULL;
734
735         assert(dest);
736
737         if (arg_private_network)
738                 return 0;
739
740         /* Fix resolv.conf, if possible */
741         where = strappend(dest, "/etc/resolv.conf");
742         if (!where)
743                 return log_oom();
744
745         /* We don't really care for the results of this really. If it
746          * fails, it fails, but meh... */
747         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
748
749         return 0;
750 }
751
752 static int setup_boot_id(const char *dest) {
753         _cleanup_free_ char *from = NULL, *to = NULL;
754         sd_id128_t rnd = {};
755         char as_uuid[37];
756         int r;
757
758         assert(dest);
759
760         if (arg_share_system)
761                 return 0;
762
763         /* Generate a new randomized boot ID, so that each boot-up of
764          * the container gets a new one */
765
766         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
767         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
768         if (!from || !to)
769                 return log_oom();
770
771         r = sd_id128_randomize(&rnd);
772         if (r < 0) {
773                 log_error("Failed to generate random boot id: %s", strerror(-r));
774                 return r;
775         }
776
777         snprintf(as_uuid, sizeof(as_uuid),
778                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
779                  SD_ID128_FORMAT_VAL(rnd));
780         char_array_0(as_uuid);
781
782         r = write_string_file(from, as_uuid);
783         if (r < 0) {
784                 log_error("Failed to write boot id: %s", strerror(-r));
785                 return r;
786         }
787
788         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
789                 log_error("Failed to bind mount boot id: %m");
790                 r = -errno;
791         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
792                 log_warning("Failed to make boot id read-only: %m");
793
794         unlink(from);
795         return r;
796 }
797
798 static int copy_devnodes(const char *dest) {
799
800         static const char devnodes[] =
801                 "null\0"
802                 "zero\0"
803                 "full\0"
804                 "random\0"
805                 "urandom\0"
806                 "tty\0";
807
808         const char *d;
809         int r = 0;
810         _cleanup_umask_ mode_t u;
811
812         assert(dest);
813
814         u = umask(0000);
815
816         NULSTR_FOREACH(d, devnodes) {
817                 _cleanup_free_ char *from = NULL, *to = NULL;
818                 struct stat st;
819
820                 from = strappend("/dev/", d);
821                 to = strjoin(dest, "/dev/", d, NULL);
822                 if (!from || !to)
823                         return log_oom();
824
825                 if (stat(from, &st) < 0) {
826
827                         if (errno != ENOENT) {
828                                 log_error("Failed to stat %s: %m", from);
829                                 return -errno;
830                         }
831
832                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
833
834                         log_error("%s is not a char or block device, cannot copy", from);
835                         return -EIO;
836
837                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
838
839                         log_error("mknod(%s) failed: %m", dest);
840                         return  -errno;
841                 }
842         }
843
844         return r;
845 }
846
847 static int setup_ptmx(const char *dest) {
848         _cleanup_free_ char *p = NULL;
849
850         p = strappend(dest, "/dev/ptmx");
851         if (!p)
852                 return log_oom();
853
854         if (symlink("pts/ptmx", p) < 0) {
855                 log_error("Failed to create /dev/ptmx symlink: %m");
856                 return -errno;
857         }
858
859         return 0;
860 }
861
862 static int setup_dev_console(const char *dest, const char *console) {
863         struct stat st;
864         _cleanup_free_ char *to = NULL;
865         int r;
866         _cleanup_umask_ mode_t u;
867
868         assert(dest);
869         assert(console);
870
871         u = umask(0000);
872
873         if (stat(console, &st) < 0) {
874                 log_error("Failed to stat %s: %m", console);
875                 return -errno;
876
877         } else if (!S_ISCHR(st.st_mode)) {
878                 log_error("/dev/console is not a char device");
879                 return -EIO;
880         }
881
882         r = chmod_and_chown(console, 0600, 0, 0);
883         if (r < 0) {
884                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
885                 return r;
886         }
887
888         if (asprintf(&to, "%s/dev/console", dest) < 0)
889                 return log_oom();
890
891         /* We need to bind mount the right tty to /dev/console since
892          * ptys can only exist on pts file systems. To have something
893          * to bind mount things on we create a device node first, that
894          * has the right major/minor (note that the major minor
895          * doesn't actually matter here, since we mount it over
896          * anyway). */
897
898         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
899                 log_error("mknod() for /dev/console failed: %m");
900                 return -errno;
901         }
902
903         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
904                 log_error("Bind mount for /dev/console failed: %m");
905                 return -errno;
906         }
907
908         return 0;
909 }
910
911 static int setup_kmsg(const char *dest, int kmsg_socket) {
912         _cleanup_free_ char *from = NULL, *to = NULL;
913         int r, fd, k;
914         _cleanup_umask_ mode_t u;
915         union {
916                 struct cmsghdr cmsghdr;
917                 uint8_t buf[CMSG_SPACE(sizeof(int))];
918         } control = {};
919         struct msghdr mh = {
920                 .msg_control = &control,
921                 .msg_controllen = sizeof(control),
922         };
923         struct cmsghdr *cmsg;
924
925         assert(dest);
926         assert(kmsg_socket >= 0);
927
928         u = umask(0000);
929
930         /* We create the kmsg FIFO as /dev/kmsg, but immediately
931          * delete it after bind mounting it to /proc/kmsg. While FIFOs
932          * on the reading side behave very similar to /proc/kmsg,
933          * their writing side behaves differently from /dev/kmsg in
934          * that writing blocks when nothing is reading. In order to
935          * avoid any problems with containers deadlocking due to this
936          * we simply make /dev/kmsg unavailable to the container. */
937         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
938             asprintf(&to, "%s/proc/kmsg", dest) < 0)
939                 return log_oom();
940
941         if (mkfifo(from, 0600) < 0) {
942                 log_error("mkfifo() for /dev/kmsg failed: %m");
943                 return -errno;
944         }
945
946         r = chmod_and_chown(from, 0600, 0, 0);
947         if (r < 0) {
948                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
949                 return r;
950         }
951
952         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
953                 log_error("Bind mount for /proc/kmsg failed: %m");
954                 return -errno;
955         }
956
957         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
958         if (fd < 0) {
959                 log_error("Failed to open fifo: %m");
960                 return -errno;
961         }
962
963         cmsg = CMSG_FIRSTHDR(&mh);
964         cmsg->cmsg_level = SOL_SOCKET;
965         cmsg->cmsg_type = SCM_RIGHTS;
966         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
967         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
968
969         mh.msg_controllen = cmsg->cmsg_len;
970
971         /* Store away the fd in the socket, so that it stays open as
972          * long as we run the child */
973         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
974         close_nointr_nofail(fd);
975
976         if (k < 0) {
977                 log_error("Failed to send FIFO fd: %m");
978                 return -errno;
979         }
980
981         /* And now make the FIFO unavailable as /dev/kmsg... */
982         unlink(from);
983         return 0;
984 }
985
986 static int setup_hostname(void) {
987
988         if (arg_share_system)
989                 return 0;
990
991         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
992                 return -errno;
993
994         return 0;
995 }
996
997 static int setup_journal(const char *directory) {
998         sd_id128_t machine_id, this_id;
999         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
1000         char *id;
1001         int r;
1002
1003         p = strappend(directory, "/etc/machine-id");
1004         if (!p)
1005                 return log_oom();
1006
1007         r = read_one_line_file(p, &b);
1008         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
1009                 return 0;
1010         else if (r < 0) {
1011                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
1012                 return r;
1013         }
1014
1015         id = strstrip(b);
1016         if (isempty(id) && arg_link_journal == LINK_AUTO)
1017                 return 0;
1018
1019         /* Verify validity */
1020         r = sd_id128_from_string(id, &machine_id);
1021         if (r < 0) {
1022                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
1023                 return r;
1024         }
1025
1026         r = sd_id128_get_machine(&this_id);
1027         if (r < 0) {
1028                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
1029                 return r;
1030         }
1031
1032         if (sd_id128_equal(machine_id, this_id)) {
1033                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
1034                          "Host and machine ids are equal (%s): refusing to link journals", id);
1035                 if (arg_link_journal == LINK_AUTO)
1036                         return 0;
1037                 return
1038                         -EEXIST;
1039         }
1040
1041         if (arg_link_journal == LINK_NO)
1042                 return 0;
1043
1044         free(p);
1045         p = strappend("/var/log/journal/", id);
1046         q = strjoin(directory, "/var/log/journal/", id, NULL);
1047         if (!p || !q)
1048                 return log_oom();
1049
1050         if (path_is_mount_point(p, false) > 0) {
1051                 if (arg_link_journal != LINK_AUTO) {
1052                         log_error("%s: already a mount point, refusing to use for journal", p);
1053                         return -EEXIST;
1054                 }
1055
1056                 return 0;
1057         }
1058
1059         if (path_is_mount_point(q, false) > 0) {
1060                 if (arg_link_journal != LINK_AUTO) {
1061                         log_error("%s: already a mount point, refusing to use for journal", q);
1062                         return -EEXIST;
1063                 }
1064
1065                 return 0;
1066         }
1067
1068         r = readlink_and_make_absolute(p, &d);
1069         if (r >= 0) {
1070                 if ((arg_link_journal == LINK_GUEST ||
1071                      arg_link_journal == LINK_AUTO) &&
1072                     path_equal(d, q)) {
1073
1074                         r = mkdir_p(q, 0755);
1075                         if (r < 0)
1076                                 log_warning("failed to create directory %s: %m", q);
1077                         return 0;
1078                 }
1079
1080                 if (unlink(p) < 0) {
1081                         log_error("Failed to remove symlink %s: %m", p);
1082                         return -errno;
1083                 }
1084         } else if (r == -EINVAL) {
1085
1086                 if (arg_link_journal == LINK_GUEST &&
1087                     rmdir(p) < 0) {
1088
1089                         if (errno == ENOTDIR) {
1090                                 log_error("%s already exists and is neither a symlink nor a directory", p);
1091                                 return r;
1092                         } else {
1093                                 log_error("Failed to remove %s: %m", p);
1094                                 return -errno;
1095                         }
1096                 }
1097         } else if (r != -ENOENT) {
1098                 log_error("readlink(%s) failed: %m", p);
1099                 return r;
1100         }
1101
1102         if (arg_link_journal == LINK_GUEST) {
1103
1104                 if (symlink(q, p) < 0) {
1105                         log_error("Failed to symlink %s to %s: %m", q, p);
1106                         return -errno;
1107                 }
1108
1109                 r = mkdir_p(q, 0755);
1110                 if (r < 0)
1111                         log_warning("failed to create directory %s: %m", q);
1112                 return 0;
1113         }
1114
1115         if (arg_link_journal == LINK_HOST) {
1116                 r = mkdir_p(p, 0755);
1117                 if (r < 0) {
1118                         log_error("Failed to create %s: %m", p);
1119                         return r;
1120                 }
1121
1122         } else if (access(p, F_OK) < 0)
1123                 return 0;
1124
1125         if (dir_is_empty(q) == 0) {
1126                 log_error("%s not empty.", q);
1127                 return -ENOTEMPTY;
1128         }
1129
1130         r = mkdir_p(q, 0755);
1131         if (r < 0) {
1132                 log_error("Failed to create %s: %m", q);
1133                 return r;
1134         }
1135
1136         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
1137                 log_error("Failed to bind mount journal from host into guest: %m");
1138                 return -errno;
1139         }
1140
1141         return 0;
1142 }
1143
1144 static int setup_kdbus(const char *dest, const char *path) {
1145         const char *p;
1146
1147         if (!path)
1148                 return 0;
1149
1150         p = strappenda(dest, "/dev/kdbus");
1151         if (mkdir(p, 0755) < 0) {
1152                 log_error("Failed to create kdbus path: %m");
1153                 return  -errno;
1154         }
1155
1156         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1157                 log_error("Failed to mount kdbus domain path: %m");
1158                 return -errno;
1159         }
1160
1161         return 0;
1162 }
1163
1164 static int drop_capabilities(void) {
1165         return capability_bounding_set_drop(~arg_retain, false);
1166 }
1167
1168 static int register_machine(pid_t pid) {
1169         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1170         _cleanup_bus_unref_ sd_bus *bus = NULL;
1171         int r;
1172
1173         if (!arg_register)
1174                 return 0;
1175
1176         r = sd_bus_default_system(&bus);
1177         if (r < 0) {
1178                 log_error("Failed to open system bus: %s", strerror(-r));
1179                 return r;
1180         }
1181
1182         if (arg_keep_unit) {
1183                 r = sd_bus_call_method(
1184                                 bus,
1185                                 "org.freedesktop.machine1",
1186                                 "/org/freedesktop/machine1",
1187                                 "org.freedesktop.machine1.Manager",
1188                                 "RegisterMachine",
1189                                 &error,
1190                                 NULL,
1191                                 "sayssus",
1192                                 arg_machine,
1193                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1194                                 "nspawn",
1195                                 "container",
1196                                 (uint32_t) pid,
1197                                 strempty(arg_directory));
1198         } else {
1199                 _cleanup_bus_message_unref_ sd_bus_message *m = NULL;
1200
1201                 r = sd_bus_message_new_method_call(
1202                                 bus,
1203                                 &m,
1204                                 "org.freedesktop.machine1",
1205                                 "/org/freedesktop/machine1",
1206                                 "org.freedesktop.machine1.Manager",
1207                                 "CreateMachine");
1208                 if (r < 0) {
1209                         log_error("Failed to create message: %s", strerror(-r));
1210                         return r;
1211                 }
1212
1213                 r = sd_bus_message_append(
1214                                 m,
1215                                 "sayssus",
1216                                 arg_machine,
1217                                 SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1218                                 "nspawn",
1219                                 "container",
1220                                 (uint32_t) pid,
1221                                 strempty(arg_directory));
1222                 if (r < 0) {
1223                         log_error("Failed to append message arguments: %s", strerror(-r));
1224                         return r;
1225                 }
1226
1227                 r = sd_bus_message_open_container(m, 'a', "(sv)");
1228                 if (r < 0) {
1229                         log_error("Failed to open container: %s", strerror(-r));
1230                         return r;
1231                 }
1232
1233                 if (!isempty(arg_slice)) {
1234                         r = sd_bus_message_append(m, "(sv)", "Slice", "s", arg_slice);
1235                         if (r < 0) {
1236                                 log_error("Failed to append slice: %s", strerror(-r));
1237                                 return r;
1238                         }
1239                 }
1240
1241                 r = sd_bus_message_append(m, "(sv)", "DevicePolicy", "s", "strict");
1242                 if (r < 0) {
1243                         log_error("Failed to add device policy: %s", strerror(-r));
1244                         return r;
1245                 }
1246
1247                 r = sd_bus_message_append(m, "(sv)", "DeviceAllow", "a(ss)", 8,
1248                                           /* Allow the container to
1249                                            * access and create the API
1250                                            * device nodes, so that
1251                                            * PrivateDevices= in the
1252                                            * container can work
1253                                            * fine */
1254                                           "/dev/null", "rwm",
1255                                           "/dev/zero", "rwm",
1256                                           "/dev/full", "rwm",
1257                                           "/dev/random", "rwm",
1258                                           "/dev/urandom", "rwm",
1259                                           "/dev/tty", "rwm",
1260                                           /* Allow the container
1261                                            * access to ptys. However,
1262                                            * do not permit the
1263                                            * container to ever create
1264                                            * these device nodes. */
1265                                           "/dev/pts/ptmx", "rw",
1266                                           "char-pts", "rw");
1267                 if (r < 0) {
1268                         log_error("Failed to add device whitelist: %s", strerror(-r));
1269                         return r;
1270                 }
1271
1272                 r = sd_bus_message_close_container(m);
1273                 if (r < 0) {
1274                         log_error("Failed to close container: %s", strerror(-r));
1275                         return r;
1276                 }
1277
1278                 r = sd_bus_call(bus, m, 0, &error, NULL);
1279         }
1280
1281         if (r < 0) {
1282                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1283                 return r;
1284         }
1285
1286         return 0;
1287 }
1288
1289 static int terminate_machine(pid_t pid) {
1290         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1291         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1292         _cleanup_bus_unref_ sd_bus *bus = NULL;
1293         const char *path;
1294         int r;
1295
1296         if (!arg_register)
1297                 return 0;
1298
1299         r = sd_bus_default_system(&bus);
1300         if (r < 0) {
1301                 log_error("Failed to open system bus: %s", strerror(-r));
1302                 return r;
1303         }
1304
1305         r = sd_bus_call_method(
1306                         bus,
1307                         "org.freedesktop.machine1",
1308                         "/org/freedesktop/machine1",
1309                         "org.freedesktop.machine1.Manager",
1310                         "GetMachineByPID",
1311                         &error,
1312                         &reply,
1313                         "u",
1314                         (uint32_t) pid);
1315         if (r < 0) {
1316                 /* Note that the machine might already have been
1317                  * cleaned up automatically, hence don't consider it a
1318                  * failure if we cannot get the machine object. */
1319                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1320                 return 0;
1321         }
1322
1323         r = sd_bus_message_read(reply, "o", &path);
1324         if (r < 0)
1325                 return bus_log_parse_error(r);
1326
1327         r = sd_bus_call_method(
1328                         bus,
1329                         "org.freedesktop.machine1",
1330                         path,
1331                         "org.freedesktop.machine1.Machine",
1332                         "Terminate",
1333                         &error,
1334                         NULL,
1335                         NULL);
1336         if (r < 0) {
1337                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1338                 return 0;
1339         }
1340
1341         return 0;
1342 }
1343
1344 static int reset_audit_loginuid(void) {
1345         _cleanup_free_ char *p = NULL;
1346         int r;
1347
1348         if (arg_share_system)
1349                 return 0;
1350
1351         r = read_one_line_file("/proc/self/loginuid", &p);
1352         if (r == -ENOENT)
1353                 return 0;
1354         if (r < 0) {
1355                 log_error("Failed to read /proc/self/loginuid: %s", strerror(-r));
1356                 return r;
1357         }
1358
1359         /* Already reset? */
1360         if (streq(p, "4294967295"))
1361                 return 0;
1362
1363         r = write_string_file("/proc/self/loginuid", "4294967295");
1364         if (r < 0) {
1365                 log_error("Failed to reset audit login UID. This probably means that your kernel is too\n"
1366                           "old and you have audit enabled. Note that the auditing subsystem is known to\n"
1367                           "be incompatible with containers on old kernels. Please make sure to upgrade\n"
1368                           "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
1369                           "using systemd-nspawn. Sleeping for 5s... (%s)\n", strerror(-r));
1370
1371                 sleep(5);
1372         }
1373
1374         return 0;
1375 }
1376
1377 static int setup_veth(pid_t pid, char iface_name[IFNAMSIZ]) {
1378         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1379         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1380         int r;
1381
1382         if (!arg_private_network)
1383                 return 0;
1384
1385         if (!arg_network_veth)
1386                 return 0;
1387
1388         /* Use two different interface name prefixes depending whether
1389          * we are in bridge mode or not. */
1390         if (arg_network_bridge)
1391                 memcpy(iface_name, "vb-", 3);
1392         else
1393                 memcpy(iface_name, "ve-", 3);
1394
1395         strncpy(iface_name+3, arg_machine, IFNAMSIZ - 3);
1396
1397         r = sd_rtnl_open(&rtnl, 0);
1398         if (r < 0) {
1399                 log_error("Failed to connect to netlink: %s", strerror(-r));
1400                 return r;
1401         }
1402
1403         r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1404         if (r < 0) {
1405                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1406                 return r;
1407         }
1408
1409         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, iface_name);
1410         if (r < 0) {
1411                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1412                 return r;
1413         }
1414
1415         r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1416         if (r < 0) {
1417                 log_error("Failed to open netlink container: %s", strerror(-r));
1418                 return r;
1419         }
1420
1421         r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "veth");
1422         if (r < 0) {
1423                 log_error("Failed to append netlink kind: %s", strerror(-r));
1424                 return r;
1425         }
1426
1427         r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1428         if (r < 0) {
1429                 log_error("Failed to open netlink container: %s", strerror(-r));
1430                 return r;
1431         }
1432
1433         r = sd_rtnl_message_open_container(m, VETH_INFO_PEER);
1434         if (r < 0) {
1435                 log_error("Failed to open netlink container: %s", strerror(-r));
1436                 return r;
1437         }
1438
1439         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, "host0");
1440         if (r < 0) {
1441                 log_error("Failed to add netlink interface name: %s", strerror(-r));
1442                 return r;
1443         }
1444
1445         r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1446         if (r < 0) {
1447                 log_error("Failed to add netlink namespace field: %s", strerror(-r));
1448                 return r;
1449         }
1450
1451         r = sd_rtnl_message_close_container(m);
1452         if (r < 0) {
1453                 log_error("Failed to close netlink container: %s", strerror(-r));
1454                 return r;
1455         }
1456
1457         r = sd_rtnl_message_close_container(m);
1458         if (r < 0) {
1459                 log_error("Failed to close netlink container: %s", strerror(-r));
1460                 return r;
1461         }
1462
1463         r = sd_rtnl_message_close_container(m);
1464         if (r < 0) {
1465                 log_error("Failed to close netlink container: %s", strerror(-r));
1466                 return r;
1467         }
1468
1469         r = sd_rtnl_call(rtnl, m, 0, NULL);
1470         if (r < 0) {
1471                 log_error("Failed to add new veth interfaces: %s", strerror(-r));
1472                 return r;
1473         }
1474
1475         return 0;
1476 }
1477
1478 static int setup_bridge(const char veth_name[]) {
1479         _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1480         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1481         int r, bridge;
1482
1483         if (!arg_private_network)
1484                 return 0;
1485
1486         if (!arg_network_veth)
1487                 return 0;
1488
1489         if (!arg_network_bridge)
1490                 return 0;
1491
1492         bridge = (int) if_nametoindex(arg_network_bridge);
1493         if (bridge <= 0) {
1494                 log_error("Failed to resolve interface %s: %m", arg_network_bridge);
1495                 return -errno;
1496         }
1497
1498         r = sd_rtnl_open(&rtnl, 0);
1499         if (r < 0) {
1500                 log_error("Failed to connect to netlink: %s", strerror(-r));
1501                 return r;
1502         }
1503
1504         r = sd_rtnl_message_new_link(rtnl, &m, RTM_SETLINK, 0);
1505         if (r < 0) {
1506                 log_error("Failed to allocate netlink message: %s", strerror(-r));
1507                 return r;
1508         }
1509
1510         r = sd_rtnl_message_append_string(m, IFLA_IFNAME, veth_name);
1511         if (r < 0) {
1512                 log_error("Failed to add netlink interface name field: %s", strerror(-r));
1513                 return r;
1514         }
1515
1516         r = sd_rtnl_message_append_u32(m, IFLA_MASTER, bridge);
1517         if (r < 0) {
1518                 log_error("Failed to add netlink master field: %s", strerror(-r));
1519                 return r;
1520         }
1521
1522         r = sd_rtnl_call(rtnl, m, 0, NULL);
1523         if (r < 0) {
1524                 log_error("Failed to add veth interface to bridge: %s", strerror(-r));
1525                 return r;
1526         }
1527
1528         return 0;
1529 }
1530
1531 static int parse_interface(struct udev *udev, const char *name) {
1532         _cleanup_udev_device_unref_ struct udev_device *d = NULL;
1533         char ifi_str[2 + DECIMAL_STR_MAX(int)];
1534         int ifi;
1535
1536         ifi = (int) if_nametoindex(name);
1537         if (ifi <= 0) {
1538                 log_error("Failed to resolve interface %s: %m", name);
1539                 return -errno;
1540         }
1541
1542         sprintf(ifi_str, "n%i", ifi);
1543         d = udev_device_new_from_device_id(udev, ifi_str);
1544         if (!d) {
1545                 log_error("Failed to get udev device for interface %s: %m", name);
1546                 return -errno;
1547         }
1548
1549         if (udev_device_get_is_initialized(d) <= 0) {
1550                 log_error("Network interface %s is not initialized yet.", name);
1551                 return -EBUSY;
1552         }
1553
1554         return ifi;
1555 }
1556
1557 static int move_network_interfaces(pid_t pid) {
1558         _cleanup_udev_unref_ struct udev *udev = NULL;
1559         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1560         char **i;
1561         int r;
1562
1563         if (!arg_private_network)
1564                 return 0;
1565
1566         if (strv_isempty(arg_network_interfaces))
1567                 return 0;
1568
1569         r = sd_rtnl_open(&rtnl, 0);
1570         if (r < 0) {
1571                 log_error("Failed to connect to netlink: %s", strerror(-r));
1572                 return r;
1573         }
1574
1575         udev = udev_new();
1576         if (!udev) {
1577                 log_error("Failed to connect to udev.");
1578                 return -ENOMEM;
1579         }
1580
1581         STRV_FOREACH(i, arg_network_interfaces) {
1582                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1583                 int ifi;
1584
1585                 ifi = parse_interface(udev, *i);
1586                 if (ifi < 0)
1587                         return ifi;
1588
1589                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, ifi);
1590                 if (r < 0) {
1591                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1592                         return r;
1593                 }
1594
1595                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1596                 if (r < 0) {
1597                         log_error("Failed to append namespace PID to netlink message: %s", strerror(-r));
1598                         return r;
1599                 }
1600
1601                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1602                 if (r < 0) {
1603                         log_error("Failed to move interface %s to namespace: %s", *i, strerror(-r));
1604                         return r;
1605                 }
1606         }
1607
1608         return 0;
1609 }
1610
1611 static int setup_macvlan(pid_t pid) {
1612         _cleanup_udev_unref_ struct udev *udev = NULL;
1613         _cleanup_rtnl_unref_ sd_rtnl *rtnl = NULL;
1614         char **i;
1615         int r;
1616
1617         if (!arg_private_network)
1618                 return 0;
1619
1620         if (strv_isempty(arg_network_macvlan))
1621                 return 0;
1622
1623         r = sd_rtnl_open(&rtnl, 0);
1624         if (r < 0) {
1625                 log_error("Failed to connect to netlink: %s", strerror(-r));
1626                 return r;
1627         }
1628
1629         udev = udev_new();
1630         if (!udev) {
1631                 log_error("Failed to connect to udev.");
1632                 return -ENOMEM;
1633         }
1634
1635         STRV_FOREACH(i, arg_network_macvlan) {
1636                 _cleanup_rtnl_message_unref_ sd_rtnl_message *m = NULL;
1637                 _cleanup_free_ char *n = NULL;
1638                 int ifi;
1639
1640                 ifi = parse_interface(udev, *i);
1641                 if (ifi < 0)
1642                         return ifi;
1643
1644                 r = sd_rtnl_message_new_link(rtnl, &m, RTM_NEWLINK, 0);
1645                 if (r < 0) {
1646                         log_error("Failed to allocate netlink message: %s", strerror(-r));
1647                         return r;
1648                 }
1649
1650                 r = sd_rtnl_message_append_u32(m, IFLA_LINK, ifi);
1651                 if (r < 0) {
1652                         log_error("Failed to add netlink interface index: %s", strerror(-r));
1653                         return r;
1654                 }
1655
1656                 n = strappend("mv-", *i);
1657                 if (!n)
1658                         return log_oom();
1659
1660                 strshorten(n, IFNAMSIZ-1);
1661
1662                 r = sd_rtnl_message_append_string(m, IFLA_IFNAME, n);
1663                 if (r < 0) {
1664                         log_error("Failed to add netlink interface name: %s", strerror(-r));
1665                         return r;
1666                 }
1667
1668                 r = sd_rtnl_message_append_u32(m, IFLA_NET_NS_PID, pid);
1669                 if (r < 0) {
1670                         log_error("Failed to add netlink namespace field: %s", strerror(-r));
1671                         return r;
1672                 }
1673
1674                 r = sd_rtnl_message_open_container(m, IFLA_LINKINFO);
1675                 if (r < 0) {
1676                         log_error("Failed to open netlink container: %s", strerror(-r));
1677                         return r;
1678                 }
1679
1680                 r = sd_rtnl_message_append_string(m, IFLA_INFO_KIND, "macvlan");
1681                 if (r < 0) {
1682                         log_error("Failed to append netlink kind: %s", strerror(-r));
1683                         return r;
1684                 }
1685
1686                 r = sd_rtnl_message_open_container(m, IFLA_INFO_DATA);
1687                 if (r < 0) {
1688                         log_error("Failed to open netlink container: %s", strerror(-r));
1689                         return r;
1690                 }
1691
1692                 r = sd_rtnl_message_append_u32(m, IFLA_MACVLAN_MODE, MACVLAN_MODE_BRIDGE);
1693                 if (r < 0) {
1694                         log_error("Failed to append macvlan mode: %s", strerror(-r));
1695                         return r;
1696                 }
1697
1698                 r = sd_rtnl_message_close_container(m);
1699                 if (r < 0) {
1700                         log_error("Failed to close netlink container: %s", strerror(-r));
1701                         return r;
1702                 }
1703
1704                 r = sd_rtnl_message_close_container(m);
1705                 if (r < 0) {
1706                         log_error("Failed to close netlink container: %s", strerror(-r));
1707                         return r;
1708                 }
1709
1710                 r = sd_rtnl_call(rtnl, m, 0, NULL);
1711                 if (r < 0) {
1712                         log_error("Failed to add new macvlan interfaces: %s", strerror(-r));
1713                         return r;
1714                 }
1715         }
1716
1717         return 0;
1718 }
1719
/* Installs a seccomp filter that makes creation of audit netlink
 * sockets fail inside the container. When built without seccomp
 * support this does nothing.
 *
 * Returns 0 on success (or when seccomp support is compiled out), a
 * negative error code otherwise. */
static int audit_still_doesnt_work_in_containers(void) {

#ifndef HAVE_SECCOMP
        return 0;
#else
        scmp_filter_ctx ctx;
        int ret;

        /*
           Audit does not work properly inside containers: most of the
           userspace audit hookup fails there. Rather than trying to
           make it work we simply refuse creation of audit sockets.

           With this filter socket(AF_NETLINK, *, NETLINK_AUDIT) fails
           with EAFNOSUPPORT, which audit userspace takes as a sign
           that audit is disabled in the kernel.
         */

        /* Default action: allow everything not matched by a rule. */
        ctx = seccomp_init(SCMP_ACT_ALLOW);
        if (!ctx)
                return log_oom();

        ret = seccomp_add_secondary_archs(ctx);
        if (ret < 0) {
                log_error("Failed to add secondary archs to seccomp filter: %s", strerror(-ret));
                goto release;
        }

        /* Match on both the address family (arg 0) and the protocol (arg 2). */
        ret = seccomp_rule_add(
                        ctx,
                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                        SCMP_SYS(socket),
                        2,
                        SCMP_A0(SCMP_CMP_EQ, AF_NETLINK),
                        SCMP_A2(SCMP_CMP_EQ, NETLINK_AUDIT));
        if (ret < 0) {
                log_error("Failed to add audit seccomp rule: %s", strerror(-ret));
                goto release;
        }

        /* Do not have the filter turn on NO_NEW_PRIVS. */
        ret = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (ret < 0) {
                log_error("Failed to unset NO_NEW_PRIVS: %s", strerror(-ret));
                goto release;
        }

        ret = seccomp_load(ctx);
        if (ret < 0)
                log_error("Failed to install seccomp audit filter: %s", strerror(-ret));

release:
        seccomp_release(ctx);
        return ret;
#endif

}
1776
1777 int main(int argc, char *argv[]) {
1778
1779         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1780         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1781         _cleanup_free_ char *kdbus_domain = NULL;
1782         _cleanup_fdset_free_ FDSet *fds = NULL;
1783         const char *console = NULL;
1784         int r = EXIT_FAILURE, k;
1785         int n_fd_passed;
1786         pid_t pid = 0;
1787         sigset_t mask;
1788         char veth_name[IFNAMSIZ];
1789
1790         log_parse_environment();
1791         log_open();
1792
1793         k = parse_argv(argc, argv);
1794         if (k < 0)
1795                 goto finish;
1796         else if (k == 0) {
1797                 r = EXIT_SUCCESS;
1798                 goto finish;
1799         }
1800
1801         if (arg_directory) {
1802                 char *p;
1803
1804                 p = path_make_absolute_cwd(arg_directory);
1805                 free(arg_directory);
1806                 arg_directory = p;
1807         } else
1808                 arg_directory = get_current_dir_name();
1809
1810         if (!arg_directory) {
1811                 log_error("Failed to determine path, please use -D.");
1812                 goto finish;
1813         }
1814
1815         path_kill_slashes(arg_directory);
1816
1817         if (!arg_machine) {
1818                 arg_machine = strdup(basename(arg_directory));
1819                 if (!arg_machine) {
1820                         log_oom();
1821                         goto finish;
1822                 }
1823
1824                 hostname_cleanup(arg_machine, false);
1825                 if (isempty(arg_machine)) {
1826                         log_error("Failed to determine machine name automatically, please use -M.");
1827                         goto finish;
1828                 }
1829         }
1830
1831         if (geteuid() != 0) {
1832                 log_error("Need to be root.");
1833                 goto finish;
1834         }
1835
1836         if (sd_booted() <= 0) {
1837                 log_error("Not running on a systemd system.");
1838                 goto finish;
1839         }
1840
1841         if (path_equal(arg_directory, "/")) {
1842                 log_error("Spawning container on root directory not supported.");
1843                 goto finish;
1844         }
1845
1846         if (arg_boot) {
1847                 if (path_is_os_tree(arg_directory) <= 0) {
1848                         log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1849                         goto finish;
1850                 }
1851         } else {
1852                 const char *p;
1853
1854                 p = strappenda(arg_directory,
1855                                argc > optind && path_is_absolute(argv[optind]) ? argv[optind] : "/usr/bin/");
1856                 if (access(p, F_OK) < 0) {
1857                         log_error("Directory %s lacks the binary to execute or doesn't look like a binary tree. Refusing.", arg_directory);
1858                         goto finish;
1859
1860                 }
1861         }
1862
1863         log_close();
1864         n_fd_passed = sd_listen_fds(false);
1865         if (n_fd_passed > 0) {
1866                 k = fdset_new_listen_fds(&fds, false);
1867                 if (k < 0) {
1868                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1869                         goto finish;
1870                 }
1871         }
1872         fdset_close_others(fds);
1873         log_open();
1874
1875         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1876         if (master < 0) {
1877                 log_error("Failed to acquire pseudo tty: %m");
1878                 goto finish;
1879         }
1880
1881         console = ptsname(master);
1882         if (!console) {
1883                 log_error("Failed to determine tty name: %m");
1884                 goto finish;
1885         }
1886
1887         if (!arg_quiet)
1888                 log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1889
1890         if (unlockpt(master) < 0) {
1891                 log_error("Failed to unlock tty: %m");
1892                 goto finish;
1893         }
1894
1895         if (access("/dev/kdbus/control", F_OK) >= 0) {
1896
1897                 if (arg_share_system) {
1898                         kdbus_domain = strdup("/dev/kdbus");
1899                         if (!kdbus_domain) {
1900                                 log_oom();
1901                                 goto finish;
1902                         }
1903                 } else {
1904                         const char *ns;
1905
1906                         ns = strappenda("machine-", arg_machine);
1907                         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1908                         if (r < 0)
1909                                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1910                         else
1911                                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1912                 }
1913         }
1914
1915         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1916                 log_error("Failed to create kmsg socket pair: %m");
1917                 goto finish;
1918         }
1919
1920         sd_notify(0, "READY=1");
1921
1922         assert_se(sigemptyset(&mask) == 0);
1923         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1924         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1925
1926         for (;;) {
1927                 siginfo_t status;
1928
1929                 sync_fd = eventfd(0, EFD_CLOEXEC);
1930                 if (sync_fd < 0) {
1931                         log_error("Failed to create event fd: %m");
1932                         goto finish;
1933                 }
1934
1935                 pid = syscall(__NR_clone,
1936                               SIGCHLD|CLONE_NEWNS|
1937                               (arg_share_system ? 0 : CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS)|
1938                               (arg_private_network ? CLONE_NEWNET : 0), NULL);
1939                 if (pid < 0) {
1940                         if (errno == EINVAL)
1941                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1942                         else
1943                                 log_error("clone() failed: %m");
1944
1945                         goto finish;
1946                 }
1947
1948                 if (pid == 0) {
1949                         /* child */
1950                         const char *home = NULL;
1951                         uid_t uid = (uid_t) -1;
1952                         gid_t gid = (gid_t) -1;
1953                         unsigned n_env = 2;
1954                         const char *envp[] = {
1955                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1956                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1957                                 NULL, /* TERM */
1958                                 NULL, /* HOME */
1959                                 NULL, /* USER */
1960                                 NULL, /* LOGNAME */
1961                                 NULL, /* container_uuid */
1962                                 NULL, /* LISTEN_FDS */
1963                                 NULL, /* LISTEN_PID */
1964                                 NULL
1965                         };
1966                         char **env_use;
1967                         eventfd_t x;
1968
1969                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1970                         if (envp[n_env])
1971                                 n_env ++;
1972
1973                         close_nointr_nofail(master);
1974                         master = -1;
1975
1976                         close_nointr(STDIN_FILENO);
1977                         close_nointr(STDOUT_FILENO);
1978                         close_nointr(STDERR_FILENO);
1979
1980                         close_nointr_nofail(kmsg_socket_pair[0]);
1981                         kmsg_socket_pair[0] = -1;
1982
1983                         reset_all_signal_handlers();
1984
1985                         assert_se(sigemptyset(&mask) == 0);
1986                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1987
1988                         k = open_terminal(console, O_RDWR);
1989                         if (k != STDIN_FILENO) {
1990                                 if (k >= 0) {
1991                                         close_nointr_nofail(k);
1992                                         k = -EINVAL;
1993                                 }
1994
1995                                 log_error("Failed to open console: %s", strerror(-k));
1996                                 goto child_fail;
1997                         }
1998
1999                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
2000                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
2001                                 log_error("Failed to duplicate console: %m");
2002                                 goto child_fail;
2003                         }
2004
2005                         if (setsid() < 0) {
2006                                 log_error("setsid() failed: %m");
2007                                 goto child_fail;
2008                         }
2009
2010                         if (reset_audit_loginuid() < 0)
2011                                 goto child_fail;
2012
2013                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
2014                                 log_error("PR_SET_PDEATHSIG failed: %m");
2015                                 goto child_fail;
2016                         }
2017
2018                         /* Mark everything as slave, so that we still
2019                          * receive mounts from the real root, but don't
2020                          * propagate mounts to the real root. */
2021                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
2022                                 log_error("MS_SLAVE|MS_REC failed: %m");
2023                                 goto child_fail;
2024                         }
2025
2026                         /* Turn directory into bind mount */
2027                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
2028                                 log_error("Failed to make bind mount.");
2029                                 goto child_fail;
2030                         }
2031
2032                         if (arg_read_only)
2033                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
2034                                         log_error("Failed to make read-only.");
2035                                         goto child_fail;
2036                                 }
2037
2038                         if (mount_all(arg_directory) < 0)
2039                                 goto child_fail;
2040
2041                         if (copy_devnodes(arg_directory) < 0)
2042                                 goto child_fail;
2043
2044                         if (setup_ptmx(arg_directory) < 0)
2045                                 goto child_fail;
2046
2047                         dev_setup(arg_directory);
2048
2049                         if (audit_still_doesnt_work_in_containers() < 0)
2050                                 goto child_fail;
2051
2052                         if (setup_dev_console(arg_directory, console) < 0)
2053                                 goto child_fail;
2054
2055                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
2056                                 goto child_fail;
2057
2058                         close_nointr_nofail(kmsg_socket_pair[1]);
2059                         kmsg_socket_pair[1] = -1;
2060
2061                         if (setup_boot_id(arg_directory) < 0)
2062                                 goto child_fail;
2063
2064                         if (setup_timezone(arg_directory) < 0)
2065                                 goto child_fail;
2066
2067                         if (setup_resolv_conf(arg_directory) < 0)
2068                                 goto child_fail;
2069
2070                         if (setup_journal(arg_directory) < 0)
2071                                 goto child_fail;
2072
2073                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
2074                                 goto child_fail;
2075
2076                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
2077                                 goto child_fail;
2078
2079                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
2080                                 goto child_fail;
2081
2082                         if (chdir(arg_directory) < 0) {
2083                                 log_error("chdir(%s) failed: %m", arg_directory);
2084                                 goto child_fail;
2085                         }
2086
2087                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
2088                                 log_error("mount(MS_MOVE) failed: %m");
2089                                 goto child_fail;
2090                         }
2091
2092                         if (chroot(".") < 0) {
2093                                 log_error("chroot() failed: %m");
2094                                 goto child_fail;
2095                         }
2096
2097                         if (chdir("/") < 0) {
2098                                 log_error("chdir() failed: %m");
2099                                 goto child_fail;
2100                         }
2101
2102                         umask(0022);
2103
2104                         if (arg_private_network)
2105                                 loopback_setup();
2106
2107                         if (drop_capabilities() < 0) {
2108                                 log_error("drop_capabilities() failed: %m");
2109                                 goto child_fail;
2110                         }
2111
2112                         if (arg_user) {
2113
2114                                 /* Note that this resolves user names
2115                                  * inside the container, and hence
2116                                  * accesses the NSS modules from the
2117                                  * container and not the host. This is
2118                                  * a bit weird... */
2119
2120                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
2121                                         log_error("get_user_creds() failed: %m");
2122                                         goto child_fail;
2123                                 }
2124
2125                                 if (mkdir_parents_label(home, 0775) < 0) {
2126                                         log_error("mkdir_parents_label() failed: %m");
2127                                         goto child_fail;
2128                                 }
2129
2130                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
2131                                         log_error("mkdir_safe_label() failed: %m");
2132                                         goto child_fail;
2133                                 }
2134
2135                                 if (initgroups((const char*)arg_user, gid) < 0) {
2136                                         log_error("initgroups() failed: %m");
2137                                         goto child_fail;
2138                                 }
2139
2140                                 if (setresgid(gid, gid, gid) < 0) {
2141                                         log_error("setregid() failed: %m");
2142                                         goto child_fail;
2143                                 }
2144
2145                                 if (setresuid(uid, uid, uid) < 0) {
2146                                         log_error("setreuid() failed: %m");
2147                                         goto child_fail;
2148                                 }
2149                         } else {
2150                                 /* Reset everything fully to 0, just in case */
2151
2152                                 if (setgroups(0, NULL) < 0) {
2153                                         log_error("setgroups() failed: %m");
2154                                         goto child_fail;
2155                                 }
2156
2157                                 if (setresgid(0, 0, 0) < 0) {
2158                                         log_error("setregid() failed: %m");
2159                                         goto child_fail;
2160                                 }
2161
2162                                 if (setresuid(0, 0, 0) < 0) {
2163                                         log_error("setreuid() failed: %m");
2164                                         goto child_fail;
2165                                 }
2166                         }
2167
2168                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
2169                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
2170                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
2171                                 log_oom();
2172                                 goto child_fail;
2173                         }
2174
2175                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
2176                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
2177                                         log_oom();
2178                                         goto child_fail;
2179                                 }
2180                         }
2181
2182                         if (fdset_size(fds) > 0) {
2183                                 k = fdset_cloexec(fds, false);
2184                                 if (k < 0) {
2185                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
2186                                         goto child_fail;
2187                                 }
2188
2189                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
2190                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
2191                                         log_oom();
2192                                         goto child_fail;
2193                                 }
2194                         }
2195
2196                         setup_hostname();
2197
2198                         if (arg_personality != 0xffffffffLU) {
2199                                 if (personality(arg_personality) < 0) {
2200                                         log_error("personality() failed: %m");
2201                                         goto child_fail;
2202                                 }
2203                         }
2204
2205                         eventfd_read(sync_fd, &x);
2206                         close_nointr_nofail(sync_fd);
2207                         sync_fd = -1;
2208
2209                         if (!strv_isempty(arg_setenv)) {
2210                                 char **n;
2211
2212                                 n = strv_env_merge(2, envp, arg_setenv);
2213                                 if (!n) {
2214                                         log_oom();
2215                                         goto child_fail;
2216                                 }
2217
2218                                 env_use = n;
2219                         } else
2220                                 env_use = (char**) envp;
2221
2222 #ifdef HAVE_SELINUX
2223                         if (arg_selinux_context)
2224                                 if (setexeccon((security_context_t) arg_selinux_context) < 0)
2225                                         log_error("setexeccon(\"%s\") failed: %m", arg_selinux_context);
2226 #endif
2227                         if (arg_boot) {
2228                                 char **a;
2229                                 size_t l;
2230
2231                                 /* Automatically search for the init system */
2232
2233                                 l = 1 + argc - optind;
2234                                 a = newa(char*, l + 1);
2235                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
2236
2237                                 a[0] = (char*) "/usr/lib/systemd/systemd";
2238                                 execve(a[0], a, env_use);
2239
2240                                 a[0] = (char*) "/lib/systemd/systemd";
2241                                 execve(a[0], a, env_use);
2242
2243                                 a[0] = (char*) "/sbin/init";
2244                                 execve(a[0], a, env_use);
2245                         } else if (argc > optind)
2246                                 execvpe(argv[optind], argv + optind, env_use);
2247                         else {
2248                                 chdir(home ? home : "/root");
2249                                 execle("/bin/bash", "-bash", NULL, env_use);
2250                                 execle("/bin/sh", "-sh", NULL, env_use);
2251                         }
2252
2253                         log_error("execv() failed: %m");
2254
2255                 child_fail:
2256                         _exit(EXIT_FAILURE);
2257                 }
2258
2259                 fdset_free(fds);
2260                 fds = NULL;
2261
2262                 r = register_machine(pid);
2263                 if (r < 0)
2264                         goto finish;
2265
2266                 r = move_network_interfaces(pid);
2267                 if (r < 0)
2268                         goto finish;
2269
2270                 r = setup_veth(pid, veth_name);
2271                 if (r < 0)
2272                         goto finish;
2273
2274                 r = setup_bridge(veth_name);
2275                 if (r < 0)
2276                         goto finish;
2277
2278                 r = setup_macvlan(pid);
2279                 if (r < 0)
2280                         goto finish;
2281
2282                 eventfd_write(sync_fd, 1);
2283                 close_nointr_nofail(sync_fd);
2284                 sync_fd = -1;
2285
2286                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
2287                 if (k < 0) {
2288                         r = EXIT_FAILURE;
2289                         break;
2290                 }
2291
2292                 if (!arg_quiet)
2293                         putc('\n', stdout);
2294
2295                 /* Kill if it is not dead yet anyway */
2296                 terminate_machine(pid);
2297
2298                 /* Redundant, but better safe than sorry */
2299                 kill(pid, SIGKILL);
2300
2301                 k = wait_for_terminate(pid, &status);
2302                 pid = 0;
2303
2304                 if (k < 0) {
2305                         r = EXIT_FAILURE;
2306                         break;
2307                 }
2308
2309                 if (status.si_code == CLD_EXITED) {
2310                         r = status.si_status;
2311                         if (status.si_status != 0) {
2312                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
2313                                 break;
2314                         }
2315
2316                         if (!arg_quiet)
2317                                 log_debug("Container %s exited successfully.", arg_machine);
2318                         break;
2319                 } else if (status.si_code == CLD_KILLED &&
2320                            status.si_status == SIGINT) {
2321
2322                         if (!arg_quiet)
2323                                 log_info("Container %s has been shut down.", arg_machine);
2324                         r = 0;
2325                         break;
2326                 } else if (status.si_code == CLD_KILLED &&
2327                            status.si_status == SIGHUP) {
2328
2329                         if (!arg_quiet)
2330                                 log_info("Container %s is being rebooted.", arg_machine);
2331                         continue;
2332                 } else if (status.si_code == CLD_KILLED ||
2333                            status.si_code == CLD_DUMPED) {
2334
2335                         log_error("Container %s terminated by signal %s.", arg_machine, signal_to_string(status.si_status));
2336                         r = EXIT_FAILURE;
2337                         break;
2338                 } else {
2339                         log_error("Container %s failed due to unknown reason.", arg_machine);
2340                         r = EXIT_FAILURE;
2341                         break;
2342                 }
2343         }
2344
2345 finish:
2346         if (pid > 0)
2347                 kill(pid, SIGKILL);
2348
2349         free(arg_directory);
2350         free(arg_machine);
2351         free(arg_user);
2352         strv_free(arg_setenv);
2353         strv_free(arg_network_interfaces);
2354         strv_free(arg_network_macvlan);
2355         strv_free(arg_bind);
2356         strv_free(arg_bind_ro);
2357
2358         return r;
2359 }