chiark / gitweb /
nspawn: always use default bus
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <termios.h>
37 #include <sys/signalfd.h>
38 #include <grp.h>
39 #include <linux/fs.h>
40 #include <sys/un.h>
41 #include <sys/socket.h>
42 #include <linux/netlink.h>
43 #include <sys/eventfd.h>
44 #if HAVE_SELINUX
45 #include <selinux/selinux.h>
46 #endif
47
48 #include "sd-daemon.h"
49 #include "sd-bus.h"
50 #include "sd-id128.h"
51 #include "log.h"
52 #include "util.h"
53 #include "mkdir.h"
54 #include "macro.h"
55 #include "audit.h"
56 #include "missing.h"
57 #include "cgroup-util.h"
58 #include "strv.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "dev-setup.h"
62 #include "fdset.h"
63 #include "build.h"
64 #include "fileio.h"
65 #include "bus-util.h"
66 #include "bus-error.h"
67 #include "ptyfwd.h"
68 #include "bus-kernel.h"
69 #include "env-util.h"
70 #include "def.h"
71
/* Journal linking mode, selected with --link-journal= (or -j). The command
 * line keywords "no", "auto", "host" and "guest" map onto these values. */
typedef enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, the built-in default */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST,     /* --link-journal=guest, also selected by -j */
} LinkJournal;
78
79 static char *arg_directory = NULL;
80 static char *arg_user = NULL;
81 static sd_id128_t arg_uuid = {};
82 static char *arg_machine = NULL;
83 static char *arg_process_label = NULL;
84 static char *arg_file_label = NULL;
85 static const char *arg_slice = NULL;
86 static bool arg_private_network = false;
87 static bool arg_read_only = false;
88 static bool arg_boot = false;
89 static LinkJournal arg_link_journal = LINK_AUTO;
90 static uint64_t arg_retain =
91         (1ULL << CAP_CHOWN) |
92         (1ULL << CAP_DAC_OVERRIDE) |
93         (1ULL << CAP_DAC_READ_SEARCH) |
94         (1ULL << CAP_FOWNER) |
95         (1ULL << CAP_FSETID) |
96         (1ULL << CAP_IPC_OWNER) |
97         (1ULL << CAP_KILL) |
98         (1ULL << CAP_LEASE) |
99         (1ULL << CAP_LINUX_IMMUTABLE) |
100         (1ULL << CAP_NET_BIND_SERVICE) |
101         (1ULL << CAP_NET_BROADCAST) |
102         (1ULL << CAP_NET_RAW) |
103         (1ULL << CAP_SETGID) |
104         (1ULL << CAP_SETFCAP) |
105         (1ULL << CAP_SETPCAP) |
106         (1ULL << CAP_SETUID) |
107         (1ULL << CAP_SYS_ADMIN) |
108         (1ULL << CAP_SYS_CHROOT) |
109         (1ULL << CAP_SYS_NICE) |
110         (1ULL << CAP_SYS_PTRACE) |
111         (1ULL << CAP_SYS_TTY_CONFIG) |
112         (1ULL << CAP_SYS_RESOURCE) |
113         (1ULL << CAP_SYS_BOOT) |
114         (1ULL << CAP_AUDIT_WRITE) |
115         (1ULL << CAP_AUDIT_CONTROL) |
116         (1ULL << CAP_MKNOD);
117 static char **arg_bind = NULL;
118 static char **arg_bind_ro = NULL;
119 static char **arg_setenv = NULL;
120
/* Prints the usage summary to stdout. Always returns 0.
 *
 * Note that -j is documented as --link-journal=guest, which is what
 * parse_argv() actually selects for it. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                 Show this help\n"
               "     --version              Print version string\n"
               "  -D --directory=NAME       Root directory for the container\n"
               "  -b --boot                 Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER            Run the command under specified user or uid\n"
               "     --uuid=UUID            Set a specific machine UUID for the container\n"
               "  -M --machine=NAME         Set the machine name for the container\n"
               "  -S --slice=SLICE          Place the container in the specified slice\n"
               "  -L --file-label=LABEL     Set the MAC file label to be used by tmpfs file\n"
               "                            systems in the container\n"
               "  -Z --process-label=LABEL  Set the MAC label to be used by processes in\n"
               "                            the container\n"
               "     --private-network      Disable network in container\n"
               "     --read-only            Mount the root directory read-only\n"
               "     --capability=CAP       In addition to the default, retain specified\n"
               "                            capability\n"
               "     --drop-capability=CAP  Drop the specified capability from the default set\n"
               "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, host\n"
               "  -j                        Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]     Bind mount a file or directory from the host into\n"
               "                            the container\n"
               "     --bind-ro=PATH[:PATH]  Similar, but creates a read-only bind mount\n"
               "     --setenv=NAME=VALUE    Pass an environment variable to PID 1\n",
               program_invocation_short_name);

        return 0;
}
152
153 static int parse_argv(int argc, char *argv[]) {
154
155         enum {
156                 ARG_VERSION = 0x100,
157                 ARG_PRIVATE_NETWORK,
158                 ARG_UUID,
159                 ARG_READ_ONLY,
160                 ARG_CAPABILITY,
161                 ARG_DROP_CAPABILITY,
162                 ARG_LINK_JOURNAL,
163                 ARG_BIND,
164                 ARG_BIND_RO,
165                 ARG_SETENV,
166         };
167
168         static const struct option options[] = {
169                 { "help",            no_argument,       NULL, 'h'                 },
170                 { "version",         no_argument,       NULL, ARG_VERSION         },
171                 { "directory",       required_argument, NULL, 'D'                 },
172                 { "user",            required_argument, NULL, 'u'                 },
173                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
174                 { "boot",            no_argument,       NULL, 'b'                 },
175                 { "uuid",            required_argument, NULL, ARG_UUID            },
176                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
177                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
178                 { "drop-capability", required_argument, NULL, ARG_DROP_CAPABILITY },
179                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
180                 { "bind",            required_argument, NULL, ARG_BIND            },
181                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
182                 { "machine",         required_argument, NULL, 'M'                 },
183                 { "slice",           required_argument, NULL, 'S'                 },
184                 { "setenv",          required_argument, NULL, ARG_SETENV          },
185                 { "process-label",   required_argument, NULL, 'Z'                 },
186                 { "file-label",      required_argument, NULL, 'L'                 },
187                 {}
188         };
189
190         int c, r;
191
192         assert(argc >= 0);
193         assert(argv);
194
195         while ((c = getopt_long(argc, argv, "+hD:u:bL:M:jS:Z:", options, NULL)) >= 0) {
196
197                 switch (c) {
198
199                 case 'h':
200                         return help();
201
202                 case ARG_VERSION:
203                         puts(PACKAGE_STRING);
204                         puts(SYSTEMD_FEATURES);
205                         return 0;
206
207                 case 'D':
208                         free(arg_directory);
209                         arg_directory = canonicalize_file_name(optarg);
210                         if (!arg_directory) {
211                                 log_error("Invalid root directory: %m");
212                                 return -ENOMEM;
213                         }
214
215                         break;
216
217                 case 'u':
218                         free(arg_user);
219                         arg_user = strdup(optarg);
220                         if (!arg_user)
221                                 return log_oom();
222
223                         break;
224
225                 case ARG_PRIVATE_NETWORK:
226                         arg_private_network = true;
227                         break;
228
229                 case 'b':
230                         arg_boot = true;
231                         break;
232
233                 case ARG_UUID:
234                         r = sd_id128_from_string(optarg, &arg_uuid);
235                         if (r < 0) {
236                                 log_error("Invalid UUID: %s", optarg);
237                                 return r;
238                         }
239                         break;
240
241                 case 'S':
242                         arg_slice = strdup(optarg);
243                         if (!arg_slice)
244                                 return log_oom();
245
246                         break;
247
248                 case 'M':
249                         if (!hostname_is_valid(optarg)) {
250                                 log_error("Invalid machine name: %s", optarg);
251                                 return -EINVAL;
252                         }
253
254                         free(arg_machine);
255                         arg_machine = strdup(optarg);
256                         if (!arg_machine)
257                                 return log_oom();
258
259                         break;
260
261                 case 'L':
262                         arg_file_label = optarg;
263                         break;
264
265                 case 'Z':
266                         arg_process_label = optarg;
267                         break;
268
269                 case ARG_READ_ONLY:
270                         arg_read_only = true;
271                         break;
272
273                 case ARG_CAPABILITY:
274                 case ARG_DROP_CAPABILITY: {
275                         char *state, *word;
276                         size_t length;
277
278                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
279                                 cap_value_t cap;
280                                 char *t;
281
282                                 t = strndup(word, length);
283                                 if (!t)
284                                         return log_oom();
285
286                                 if (cap_from_name(t, &cap) < 0) {
287                                         log_error("Failed to parse capability %s.", t);
288                                         free(t);
289                                         return -EINVAL;
290                                 }
291
292                                 free(t);
293
294                                 if (c == ARG_CAPABILITY)
295                                         arg_retain |= 1ULL << (uint64_t) cap;
296                                 else
297                                         arg_retain &= ~(1ULL << (uint64_t) cap);
298                         }
299
300                         break;
301                 }
302
303                 case 'j':
304                         arg_link_journal = LINK_GUEST;
305                         break;
306
307                 case ARG_LINK_JOURNAL:
308                         if (streq(optarg, "auto"))
309                                 arg_link_journal = LINK_AUTO;
310                         else if (streq(optarg, "no"))
311                                 arg_link_journal = LINK_NO;
312                         else if (streq(optarg, "guest"))
313                                 arg_link_journal = LINK_GUEST;
314                         else if (streq(optarg, "host"))
315                                 arg_link_journal = LINK_HOST;
316                         else {
317                                 log_error("Failed to parse link journal mode %s", optarg);
318                                 return -EINVAL;
319                         }
320
321                         break;
322
323                 case ARG_BIND:
324                 case ARG_BIND_RO: {
325                         _cleanup_free_ char *a = NULL, *b = NULL;
326                         char *e;
327                         char ***x;
328
329                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
330
331                         e = strchr(optarg, ':');
332                         if (e) {
333                                 a = strndup(optarg, e - optarg);
334                                 b = strdup(e + 1);
335                         } else {
336                                 a = strdup(optarg);
337                                 b = strdup(optarg);
338                         }
339
340                         if (!a || !b)
341                                 return log_oom();
342
343                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
344                                 log_error("Invalid bind mount specification: %s", optarg);
345                                 return -EINVAL;
346                         }
347
348                         r = strv_extend(x, a);
349                         if (r < 0)
350                                 return log_oom();
351
352                         r = strv_extend(x, b);
353                         if (r < 0)
354                                 return log_oom();
355
356                         break;
357                 }
358
359                 case ARG_SETENV: {
360                         char **n;
361
362                         if (!env_assignment_is_valid(optarg)) {
363                                 log_error("Environment variable assignment '%s' is not valid.", optarg);
364                                 return -EINVAL;
365                         }
366
367                         n = strv_env_set(arg_setenv, optarg);
368                         if (!n)
369                                 return log_oom();
370
371                         strv_free(arg_setenv);
372                         arg_setenv = n;
373                         break;
374                 }
375
376                 case '?':
377                         return -EINVAL;
378
379                 default:
380                         assert_not_reached("Unhandled option");
381                 }
382         }
383
384         return 1;
385 }
386
387 static int mount_all(const char *dest) {
388
389         typedef struct MountPoint {
390                 const char *what;
391                 const char *where;
392                 const char *type;
393                 const char *options;
394                 unsigned long flags;
395                 bool fatal;
396         } MountPoint;
397
398         static const MountPoint mount_table[] = {
399                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
400                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
401                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
402                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
403                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
404                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
405                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
406                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
407 #ifdef HAVE_SELINUX
408                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
409                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
410 #endif
411         };
412
413         unsigned k;
414         int r = 0;
415
416         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
417                 _cleanup_free_ char *where = NULL;
418 #ifdef HAVE_SELINUX
419                 _cleanup_free_ char *options = NULL;
420 #endif
421                 const char *o;
422                 int t;
423
424                 where = strjoin(dest, "/", mount_table[k].where, NULL);
425                 if (!where)
426                         return log_oom();
427
428                 t = path_is_mount_point(where, true);
429                 if (t < 0) {
430                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
431
432                         if (r == 0)
433                                 r = t;
434
435                         continue;
436                 }
437
438                 /* Skip this entry if it is not a remount. */
439                 if (mount_table[k].what && t > 0)
440                         continue;
441
442                 mkdir_p(where, 0755);
443
444 #ifdef HAVE_SELINUX
445                 if (arg_file_label && (streq_ptr(mount_table[k].what, "tmpfs") || streq_ptr(mount_table[k].what, "devpts"))) {
446                         options = strjoin(mount_table[k].options, ",context=\"", arg_file_label, "\"", NULL);
447                         if (!options)
448                                 return log_oom();
449
450                         o = options;
451                 } else
452 #endif
453                         o = mount_table[k].options;
454
455
456                 if (mount(mount_table[k].what,
457                           where,
458                           mount_table[k].type,
459                           mount_table[k].flags,
460                           o) < 0 &&
461                     mount_table[k].fatal) {
462
463                         log_error("mount(%s) failed: %m", where);
464
465                         if (r == 0)
466                                 r = -errno;
467                 }
468         }
469
470         return r;
471 }
472
473 static int mount_binds(const char *dest, char **l, unsigned long flags) {
474         char **x, **y;
475
476         STRV_FOREACH_PAIR(x, y, l) {
477                 char *where;
478                 struct stat source_st, dest_st;
479                 int r;
480
481                 if (stat(*x, &source_st) < 0) {
482                         log_error("failed to stat %s: %m", *x);
483                         return -errno;
484                 }
485
486                 where = strappenda(dest, *y);
487                 r = stat(where, &dest_st);
488                 if (r == 0) {
489                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
490                                 log_error("The file types of %s and %s do not match. Refusing bind mount",
491                                                 *x, where);
492                                 return -EINVAL;
493                         }
494                 } else if (errno == ENOENT) {
495                         r = mkdir_parents_label(where, 0755);
496                         if (r < 0) {
497                                 log_error("Failed to bind mount %s: %s", *x, strerror(-r));
498                                 return r;
499                         }
500                 } else {
501                         log_error("Failed to bind mount %s: %s", *x, strerror(errno));
502                         return -errno;
503                 }
504                 /* Create the mount point, but be conservative -- refuse to create block
505                 * and char devices. */
506                 if (S_ISDIR(source_st.st_mode))
507                         mkdir_label(where, 0755);
508                 else if (S_ISFIFO(source_st.st_mode))
509                         mkfifo(where, 0644);
510                 else if (S_ISSOCK(source_st.st_mode))
511                         mknod(where, 0644 | S_IFSOCK, 0);
512                 else if (S_ISREG(source_st.st_mode))
513                         touch(where);
514                 else {
515                         log_error("Refusing to create mountpoint for file: %s", *x);
516                         return -ENOTSUP;
517                 }
518
519                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
520                         log_error("mount(%s) failed: %m", where);
521                         return -errno;
522                 }
523
524                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
525                         log_error("mount(%s) failed: %m", where);
526                         return -errno;
527                 }
528         }
529
530         return 0;
531 }
532
533 static int setup_timezone(const char *dest) {
534         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
535         char *z, *y;
536         int r;
537
538         assert(dest);
539
540         /* Fix the timezone, if possible */
541         r = readlink_malloc("/etc/localtime", &p);
542         if (r < 0) {
543                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
544                 return 0;
545         }
546
547         z = path_startswith(p, "../usr/share/zoneinfo/");
548         if (!z)
549                 z = path_startswith(p, "/usr/share/zoneinfo/");
550         if (!z) {
551                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
552                 return 0;
553         }
554
555         where = strappend(dest, "/etc/localtime");
556         if (!where)
557                 return log_oom();
558
559         r = readlink_malloc(where, &q);
560         if (r >= 0) {
561                 y = path_startswith(q, "../usr/share/zoneinfo/");
562                 if (!y)
563                         y = path_startswith(q, "/usr/share/zoneinfo/");
564
565
566                 /* Already pointing to the right place? Then do nothing .. */
567                 if (y && streq(y, z))
568                         return 0;
569         }
570
571         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
572         if (!check)
573                 return log_oom();
574
575         if (access(check, F_OK) < 0) {
576                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
577                 return 0;
578         }
579
580         what = strappend("../usr/share/zoneinfo/", z);
581         if (!what)
582                 return log_oom();
583
584         unlink(where);
585         if (symlink(what, where) < 0) {
586                 log_error("Failed to correct timezone of container: %m");
587                 return 0;
588         }
589
590         return 0;
591 }
592
593 static int setup_resolv_conf(const char *dest) {
594         char _cleanup_free_ *where = NULL;
595
596         assert(dest);
597
598         if (arg_private_network)
599                 return 0;
600
601         /* Fix resolv.conf, if possible */
602         where = strappend(dest, "/etc/resolv.conf");
603         if (!where)
604                 return log_oom();
605
606         /* We don't really care for the results of this really. If it
607          * fails, it fails, but meh... */
608         copy_file("/etc/resolv.conf", where, O_TRUNC|O_NOFOLLOW);
609
610         return 0;
611 }
612
613 static int setup_boot_id(const char *dest) {
614         _cleanup_free_ char *from = NULL, *to = NULL;
615         sd_id128_t rnd;
616         char as_uuid[37];
617         int r;
618
619         assert(dest);
620
621         /* Generate a new randomized boot ID, so that each boot-up of
622          * the container gets a new one */
623
624         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
625         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
626         if (!from || !to)
627                 return log_oom();
628
629         r = sd_id128_randomize(&rnd);
630         if (r < 0) {
631                 log_error("Failed to generate random boot id: %s", strerror(-r));
632                 return r;
633         }
634
635         snprintf(as_uuid, sizeof(as_uuid),
636                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
637                  SD_ID128_FORMAT_VAL(rnd));
638         char_array_0(as_uuid);
639
640         r = write_string_file(from, as_uuid);
641         if (r < 0) {
642                 log_error("Failed to write boot id: %s", strerror(-r));
643                 return r;
644         }
645
646         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
647                 log_error("Failed to bind mount boot id: %m");
648                 r = -errno;
649         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
650                 log_warning("Failed to make boot id read-only: %m");
651
652         unlink(from);
653         return r;
654 }
655
656 static int copy_devnodes(const char *dest) {
657
658         static const char devnodes[] =
659                 "null\0"
660                 "zero\0"
661                 "full\0"
662                 "random\0"
663                 "urandom\0"
664                 "tty\0";
665
666         const char *d;
667         int r = 0;
668         _cleanup_umask_ mode_t u;
669
670         assert(dest);
671
672         u = umask(0000);
673
674         NULSTR_FOREACH(d, devnodes) {
675                 _cleanup_free_ char *from = NULL, *to = NULL;
676                 struct stat st;
677
678                 from = strappend("/dev/", d);
679                 to = strjoin(dest, "/dev/", d, NULL);
680                 if (!from || !to)
681                         return log_oom();
682
683                 if (stat(from, &st) < 0) {
684
685                         if (errno != ENOENT) {
686                                 log_error("Failed to stat %s: %m", from);
687                                 return -errno;
688                         }
689
690                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
691
692                         log_error("%s is not a char or block device, cannot copy", from);
693                         return -EIO;
694
695                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
696
697                         log_error("mknod(%s) failed: %m", dest);
698                         return  -errno;
699                 }
700         }
701
702         return r;
703 }
704
705 static int setup_ptmx(const char *dest) {
706         _cleanup_free_ char *p = NULL;
707
708         p = strappend(dest, "/dev/ptmx");
709         if (!p)
710                 return log_oom();
711
712         if (symlink("pts/ptmx", p) < 0) {
713                 log_error("Failed to create /dev/ptmx symlink: %m");
714                 return -errno;
715         }
716
717         return 0;
718 }
719
720 static int setup_dev_console(const char *dest, const char *console) {
721         struct stat st;
722         _cleanup_free_ char *to = NULL;
723         int r;
724         _cleanup_umask_ mode_t u;
725
726         assert(dest);
727         assert(console);
728
729         u = umask(0000);
730
731         if (stat(console, &st) < 0) {
732                 log_error("Failed to stat %s: %m", console);
733                 return -errno;
734
735         } else if (!S_ISCHR(st.st_mode)) {
736                 log_error("/dev/console is not a char device");
737                 return -EIO;
738         }
739
740         r = chmod_and_chown(console, 0600, 0, 0);
741         if (r < 0) {
742                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
743                 return r;
744         }
745
746         if (asprintf(&to, "%s/dev/console", dest) < 0)
747                 return log_oom();
748
749         /* We need to bind mount the right tty to /dev/console since
750          * ptys can only exist on pts file systems. To have something
751          * to bind mount things on we create a device node first, that
752          * has the right major/minor (note that the major minor
753          * doesn't actually matter here, since we mount it over
754          * anyway). */
755
756         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
757                 log_error("mknod() for /dev/console failed: %m");
758                 return -errno;
759         }
760
761         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
762                 log_error("Bind mount for /dev/console failed: %m");
763                 return -errno;
764         }
765
766         return 0;
767 }
768
769 static int setup_kmsg(const char *dest, int kmsg_socket) {
770         _cleanup_free_ char *from = NULL, *to = NULL;
771         int r, fd, k;
772         _cleanup_umask_ mode_t u;
773         union {
774                 struct cmsghdr cmsghdr;
775                 uint8_t buf[CMSG_SPACE(sizeof(int))];
776         } control = {};
777         struct msghdr mh = {
778                 .msg_control = &control,
779                 .msg_controllen = sizeof(control),
780         };
781         struct cmsghdr *cmsg;
782
783         assert(dest);
784         assert(kmsg_socket >= 0);
785
786         u = umask(0000);
787
788         /* We create the kmsg FIFO as /dev/kmsg, but immediately
789          * delete it after bind mounting it to /proc/kmsg. While FIFOs
790          * on the reading side behave very similar to /proc/kmsg,
791          * their writing side behaves differently from /dev/kmsg in
792          * that writing blocks when nothing is reading. In order to
793          * avoid any problems with containers deadlocking due to this
794          * we simply make /dev/kmsg unavailable to the container. */
795         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
796             asprintf(&to, "%s/proc/kmsg", dest) < 0)
797                 return log_oom();
798
799         if (mkfifo(from, 0600) < 0) {
800                 log_error("mkfifo() for /dev/kmsg failed: %m");
801                 return -errno;
802         }
803
804         r = chmod_and_chown(from, 0600, 0, 0);
805         if (r < 0) {
806                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
807                 return r;
808         }
809
810         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
811                 log_error("Bind mount for /proc/kmsg failed: %m");
812                 return -errno;
813         }
814
815         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
816         if (fd < 0) {
817                 log_error("Failed to open fifo: %m");
818                 return -errno;
819         }
820
821         cmsg = CMSG_FIRSTHDR(&mh);
822         cmsg->cmsg_level = SOL_SOCKET;
823         cmsg->cmsg_type = SCM_RIGHTS;
824         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
825         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
826
827         mh.msg_controllen = cmsg->cmsg_len;
828
829         /* Store away the fd in the socket, so that it stays open as
830          * long as we run the child */
831         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
832         close_nointr_nofail(fd);
833
834         if (k < 0) {
835                 log_error("Failed to send FIFO fd: %m");
836                 return -errno;
837         }
838
839         /* And now make the FIFO unavailable as /dev/kmsg... */
840         unlink(from);
841         return 0;
842 }
843
844 static int setup_hostname(void) {
845
846         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
847                 return -errno;
848
849         return 0;
850 }
851
852 static int setup_journal(const char *directory) {
853         sd_id128_t machine_id, this_id;
854         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
855         char *id;
856         int r;
857
858         p = strappend(directory, "/etc/machine-id");
859         if (!p)
860                 return log_oom();
861
862         r = read_one_line_file(p, &b);
863         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
864                 return 0;
865         else if (r < 0) {
866                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
867                 return r;
868         }
869
870         id = strstrip(b);
871         if (isempty(id) && arg_link_journal == LINK_AUTO)
872                 return 0;
873
874         /* Verify validity */
875         r = sd_id128_from_string(id, &machine_id);
876         if (r < 0) {
877                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
878                 return r;
879         }
880
881         r = sd_id128_get_machine(&this_id);
882         if (r < 0) {
883                 log_error("Failed to retrieve machine ID: %s", strerror(-r));
884                 return r;
885         }
886
887         if (sd_id128_equal(machine_id, this_id)) {
888                 log_full(arg_link_journal == LINK_AUTO ? LOG_WARNING : LOG_ERR,
889                          "Host and machine ids are equal (%s): refusing to link journals", id);
890                 if (arg_link_journal == LINK_AUTO)
891                         return 0;
892                 return
893                         -EEXIST;
894         }
895
896         if (arg_link_journal == LINK_NO)
897                 return 0;
898
899         free(p);
900         p = strappend("/var/log/journal/", id);
901         q = strjoin(directory, "/var/log/journal/", id, NULL);
902         if (!p || !q)
903                 return log_oom();
904
905         if (path_is_mount_point(p, false) > 0) {
906                 if (arg_link_journal != LINK_AUTO) {
907                         log_error("%s: already a mount point, refusing to use for journal", p);
908                         return -EEXIST;
909                 }
910
911                 return 0;
912         }
913
914         if (path_is_mount_point(q, false) > 0) {
915                 if (arg_link_journal != LINK_AUTO) {
916                         log_error("%s: already a mount point, refusing to use for journal", q);
917                         return -EEXIST;
918                 }
919
920                 return 0;
921         }
922
923         r = readlink_and_make_absolute(p, &d);
924         if (r >= 0) {
925                 if ((arg_link_journal == LINK_GUEST ||
926                      arg_link_journal == LINK_AUTO) &&
927                     path_equal(d, q)) {
928
929                         r = mkdir_p(q, 0755);
930                         if (r < 0)
931                                 log_warning("failed to create directory %s: %m", q);
932                         return 0;
933                 }
934
935                 if (unlink(p) < 0) {
936                         log_error("Failed to remove symlink %s: %m", p);
937                         return -errno;
938                 }
939         } else if (r == -EINVAL) {
940
941                 if (arg_link_journal == LINK_GUEST &&
942                     rmdir(p) < 0) {
943
944                         if (errno == ENOTDIR) {
945                                 log_error("%s already exists and is neither a symlink nor a directory", p);
946                                 return r;
947                         } else {
948                                 log_error("Failed to remove %s: %m", p);
949                                 return -errno;
950                         }
951                 }
952         } else if (r != -ENOENT) {
953                 log_error("readlink(%s) failed: %m", p);
954                 return r;
955         }
956
957         if (arg_link_journal == LINK_GUEST) {
958
959                 if (symlink(q, p) < 0) {
960                         log_error("Failed to symlink %s to %s: %m", q, p);
961                         return -errno;
962                 }
963
964                 r = mkdir_p(q, 0755);
965                 if (r < 0)
966                         log_warning("failed to create directory %s: %m", q);
967                 return 0;
968         }
969
970         if (arg_link_journal == LINK_HOST) {
971                 r = mkdir_p(p, 0755);
972                 if (r < 0) {
973                         log_error("Failed to create %s: %m", p);
974                         return r;
975                 }
976
977         } else if (access(p, F_OK) < 0)
978                 return 0;
979
980         if (dir_is_empty(q) == 0) {
981                 log_error("%s not empty.", q);
982                 return -ENOTEMPTY;
983         }
984
985         r = mkdir_p(q, 0755);
986         if (r < 0) {
987                 log_error("Failed to create %s: %m", q);
988                 return r;
989         }
990
991         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
992                 log_error("Failed to bind mount journal from host into guest: %m");
993                 return -errno;
994         }
995
996         return 0;
997 }
998
999 static int setup_kdbus(const char *dest, const char *path) {
1000         const char *p;
1001
1002         if (!path)
1003                 return 0;
1004
1005         p = strappenda(dest, "/dev/kdbus");
1006         if (mkdir(p, 0755) < 0) {
1007                 log_error("Failed to create kdbus path: %m");
1008                 return  -errno;
1009         }
1010
1011         if (mount(path, p, "bind", MS_BIND, NULL) < 0) {
1012                 log_error("Failed to mount kdbus domain path: %m");
1013                 return -errno;
1014         }
1015
1016         return 0;
1017 }
1018
1019 static int drop_capabilities(void) {
1020         return capability_bounding_set_drop(~arg_retain, false);
1021 }
1022
1023 static int register_machine(pid_t pid) {
1024         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1025         _cleanup_bus_unref_ sd_bus *bus = NULL;
1026         int r;
1027
1028         r = sd_bus_default_system(&bus);
1029         if (r < 0) {
1030                 log_error("Failed to open system bus: %s", strerror(-r));
1031                 return r;
1032         }
1033
1034         r = sd_bus_call_method(
1035                         bus,
1036                         "org.freedesktop.machine1",
1037                         "/org/freedesktop/machine1",
1038                         "org.freedesktop.machine1.Manager",
1039                         "CreateMachine",
1040                         &error,
1041                         NULL,
1042                         "sayssusa(sv)",
1043                         arg_machine,
1044                         SD_BUS_MESSAGE_APPEND_ID128(arg_uuid),
1045                         "nspawn",
1046                         "container",
1047                         (uint32_t) pid,
1048                         strempty(arg_directory),
1049                         !isempty(arg_slice), "Slice", "s", arg_slice);
1050         if (r < 0) {
1051                 log_error("Failed to register machine: %s", bus_error_message(&error, r));
1052                 return r;
1053         }
1054
1055         return 0;
1056 }
1057
1058 static int terminate_machine(pid_t pid) {
1059         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1060         _cleanup_bus_message_unref_ sd_bus_message *reply = NULL;
1061         _cleanup_bus_unref_ sd_bus *bus = NULL;
1062         const char *path;
1063         int r;
1064
1065         r = sd_bus_default_system(&bus);
1066         if (r < 0) {
1067                 log_error("Failed to open system bus: %s", strerror(-r));
1068                 return r;
1069         }
1070
1071         r = sd_bus_call_method(
1072                         bus,
1073                         "org.freedesktop.machine1",
1074                         "/org/freedesktop/machine1",
1075                         "org.freedesktop.machine1.Manager",
1076                         "GetMachineByPID",
1077                         &error,
1078                         &reply,
1079                         "u",
1080                         (uint32_t) pid);
1081         if (r < 0) {
1082                 /* Note that the machine might already have been
1083                  * cleaned up automatically, hence don't consider it a
1084                  * failure if we cannot get the machine object. */
1085                 log_debug("Failed to get machine: %s", bus_error_message(&error, r));
1086                 return 0;
1087         }
1088
1089         r = sd_bus_message_read(reply, "o", &path);
1090         if (r < 0)
1091                 return bus_log_parse_error(r);
1092
1093         r = sd_bus_call_method(
1094                         bus,
1095                         "org.freedesktop.machine1",
1096                         path,
1097                         "org.freedesktop.machine1.Machine",
1098                         "Terminate",
1099                         &error,
1100                         NULL,
1101                         NULL);
1102         if (r < 0) {
1103                 log_debug("Failed to terminate machine: %s", bus_error_message(&error, r));
1104                 return 0;
1105         }
1106
1107         return 0;
1108 }
1109
/* Returns true if a raw NETLINK_AUDIT socket can be created, false
 * otherwise. The socket is closed again right away. */
static bool audit_enabled(void) {
        int fd;

        fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
        if (fd < 0)
                return false;

        close_nointr_nofail(fd);
        return true;
}
1120
1121 int main(int argc, char *argv[]) {
1122         pid_t pid = 0;
1123         int r = EXIT_FAILURE, k;
1124         _cleanup_close_ int master = -1, kdbus_fd = -1, sync_fd = -1;
1125         int n_fd_passed;
1126         const char *console = NULL;
1127         sigset_t mask;
1128         _cleanup_close_pipe_ int kmsg_socket_pair[2] = { -1, -1 };
1129         _cleanup_fdset_free_ FDSet *fds = NULL;
1130         _cleanup_free_ char *kdbus_domain = NULL;
1131         const char *ns;
1132
1133         log_parse_environment();
1134         log_open();
1135
1136         k = parse_argv(argc, argv);
1137         if (k < 0)
1138                 goto finish;
1139         else if (k == 0) {
1140                 r = EXIT_SUCCESS;
1141                 goto finish;
1142         }
1143
1144         if (arg_directory) {
1145                 char *p;
1146
1147                 p = path_make_absolute_cwd(arg_directory);
1148                 free(arg_directory);
1149                 arg_directory = p;
1150         } else
1151                 arg_directory = get_current_dir_name();
1152
1153         if (!arg_directory) {
1154                 log_error("Failed to determine path, please use -D.");
1155                 goto finish;
1156         }
1157
1158         path_kill_slashes(arg_directory);
1159
1160         if (!arg_machine) {
1161                 arg_machine = strdup(basename(arg_directory));
1162                 if (!arg_machine) {
1163                         log_oom();
1164                         goto finish;
1165                 }
1166
1167                 hostname_cleanup(arg_machine, false);
1168                 if (isempty(arg_machine)) {
1169                         log_error("Failed to determine machine name automatically, please use -M.");
1170                         goto finish;
1171                 }
1172         }
1173
1174         if (geteuid() != 0) {
1175                 log_error("Need to be root.");
1176                 goto finish;
1177         }
1178
1179         if (sd_booted() <= 0) {
1180                 log_error("Not running on a systemd system.");
1181                 goto finish;
1182         }
1183
1184         if (arg_boot && audit_enabled()) {
1185                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1186                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1187                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1188                 sleep(5);
1189         }
1190
1191         if (path_equal(arg_directory, "/")) {
1192                 log_error("Spawning container on root directory not supported.");
1193                 goto finish;
1194         }
1195
1196         if (path_is_os_tree(arg_directory) <= 0) {
1197                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1198                 goto finish;
1199         }
1200
1201         log_close();
1202         n_fd_passed = sd_listen_fds(false);
1203         if (n_fd_passed > 0) {
1204                 k = fdset_new_listen_fds(&fds, false);
1205                 if (k < 0) {
1206                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1207                         goto finish;
1208                 }
1209         }
1210         fdset_close_others(fds);
1211         log_open();
1212
1213         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1214         if (master < 0) {
1215                 log_error("Failed to acquire pseudo tty: %m");
1216                 goto finish;
1217         }
1218
1219         console = ptsname(master);
1220         if (!console) {
1221                 log_error("Failed to determine tty name: %m");
1222                 goto finish;
1223         }
1224
1225         log_info("Spawning container %s on %s. Press ^] three times within 1s to abort execution.", arg_machine, arg_directory);
1226
1227         if (unlockpt(master) < 0) {
1228                 log_error("Failed to unlock tty: %m");
1229                 goto finish;
1230         }
1231
1232         ns = strappenda("machine-", arg_machine);
1233         kdbus_fd = bus_kernel_create_domain(ns, &kdbus_domain);
1234         if (r < 0)
1235                 log_debug("Failed to create kdbus domain: %s", strerror(-r));
1236         else
1237                 log_debug("Successfully created kdbus domain as %s", kdbus_domain);
1238
1239         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1240                 log_error("Failed to create kmsg socket pair: %m");
1241                 goto finish;
1242         }
1243
1244         sd_notify(0, "READY=1");
1245
1246         assert_se(sigemptyset(&mask) == 0);
1247         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1248         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1249
1250         for (;;) {
1251                 siginfo_t status;
1252
1253                 sync_fd = eventfd(0, EFD_CLOEXEC);
1254                 if (sync_fd < 0) {
1255                         log_error("Failed to create event fd: %m");
1256                         goto finish;
1257                 }
1258
1259                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1260                 if (pid < 0) {
1261                         if (errno == EINVAL)
1262                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1263                         else
1264                                 log_error("clone() failed: %m");
1265
1266                         goto finish;
1267                 }
1268
1269                 if (pid == 0) {
1270                         /* child */
1271                         const char *home = NULL;
1272                         uid_t uid = (uid_t) -1;
1273                         gid_t gid = (gid_t) -1;
1274                         unsigned n_env = 2;
1275                         const char *envp[] = {
1276                                 "PATH=" DEFAULT_PATH_SPLIT_USR,
1277                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1278                                 NULL, /* TERM */
1279                                 NULL, /* HOME */
1280                                 NULL, /* USER */
1281                                 NULL, /* LOGNAME */
1282                                 NULL, /* container_uuid */
1283                                 NULL, /* LISTEN_FDS */
1284                                 NULL, /* LISTEN_PID */
1285                                 NULL
1286                         };
1287                         char **env_use;
1288                         eventfd_t x;
1289
1290                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1291                         if (envp[n_env])
1292                                 n_env ++;
1293
1294                         close_nointr_nofail(master);
1295                         master = -1;
1296
1297                         close_nointr(STDIN_FILENO);
1298                         close_nointr(STDOUT_FILENO);
1299                         close_nointr(STDERR_FILENO);
1300
1301                         close_nointr_nofail(kmsg_socket_pair[0]);
1302                         kmsg_socket_pair[0] = -1;
1303
1304                         reset_all_signal_handlers();
1305
1306                         assert_se(sigemptyset(&mask) == 0);
1307                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1308
1309                         k = open_terminal(console, O_RDWR);
1310                         if (k != STDIN_FILENO) {
1311                                 if (k >= 0) {
1312                                         close_nointr_nofail(k);
1313                                         k = -EINVAL;
1314                                 }
1315
1316                                 log_error("Failed to open console: %s", strerror(-k));
1317                                 goto child_fail;
1318                         }
1319
1320                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1321                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1322                                 log_error("Failed to duplicate console: %m");
1323                                 goto child_fail;
1324                         }
1325
1326                         if (setsid() < 0) {
1327                                 log_error("setsid() failed: %m");
1328                                 goto child_fail;
1329                         }
1330
1331                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1332                                 log_error("PR_SET_PDEATHSIG failed: %m");
1333                                 goto child_fail;
1334                         }
1335
1336                         /* Mark everything as slave, so that we still
1337                          * receive mounts from the real root, but don't
1338                          * propagate mounts to the real root. */
1339                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1340                                 log_error("MS_SLAVE|MS_REC failed: %m");
1341                                 goto child_fail;
1342                         }
1343
1344                         /* Turn directory into bind mount */
1345                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1346                                 log_error("Failed to make bind mount.");
1347                                 goto child_fail;
1348                         }
1349
1350                         if (arg_read_only)
1351                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1352                                         log_error("Failed to make read-only.");
1353                                         goto child_fail;
1354                                 }
1355
1356                         if (mount_all(arg_directory) < 0)
1357                                 goto child_fail;
1358
1359                         if (copy_devnodes(arg_directory) < 0)
1360                                 goto child_fail;
1361
1362                         if (setup_ptmx(arg_directory) < 0)
1363                                 goto child_fail;
1364
1365                         dev_setup(arg_directory);
1366
1367                         if (setup_dev_console(arg_directory, console) < 0)
1368                                 goto child_fail;
1369
1370                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1371                                 goto child_fail;
1372
1373                         close_nointr_nofail(kmsg_socket_pair[1]);
1374                         kmsg_socket_pair[1] = -1;
1375
1376                         if (setup_boot_id(arg_directory) < 0)
1377                                 goto child_fail;
1378
1379                         if (setup_timezone(arg_directory) < 0)
1380                                 goto child_fail;
1381
1382                         if (setup_resolv_conf(arg_directory) < 0)
1383                                 goto child_fail;
1384
1385                         if (setup_journal(arg_directory) < 0)
1386                                 goto child_fail;
1387
1388                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1389                                 goto child_fail;
1390
1391                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1392                                 goto child_fail;
1393
1394                         if (setup_kdbus(arg_directory, kdbus_domain) < 0)
1395                                 goto child_fail;
1396
1397                         if (chdir(arg_directory) < 0) {
1398                                 log_error("chdir(%s) failed: %m", arg_directory);
1399                                 goto child_fail;
1400                         }
1401
1402                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1403                                 log_error("mount(MS_MOVE) failed: %m");
1404                                 goto child_fail;
1405                         }
1406
1407                         if (chroot(".") < 0) {
1408                                 log_error("chroot() failed: %m");
1409                                 goto child_fail;
1410                         }
1411
1412                         if (chdir("/") < 0) {
1413                                 log_error("chdir() failed: %m");
1414                                 goto child_fail;
1415                         }
1416
1417                         umask(0022);
1418
1419                         loopback_setup();
1420
1421                         if (drop_capabilities() < 0) {
1422                                 log_error("drop_capabilities() failed: %m");
1423                                 goto child_fail;
1424                         }
1425
1426                         if (arg_user) {
1427
1428                                 /* Note that this resolves user names
1429                                  * inside the container, and hence
1430                                  * accesses the NSS modules from the
1431                                  * container and not the host. This is
1432                                  * a bit weird... */
1433
1434                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1435                                         log_error("get_user_creds() failed: %m");
1436                                         goto child_fail;
1437                                 }
1438
1439                                 if (mkdir_parents_label(home, 0775) < 0) {
1440                                         log_error("mkdir_parents_label() failed: %m");
1441                                         goto child_fail;
1442                                 }
1443
1444                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1445                                         log_error("mkdir_safe_label() failed: %m");
1446                                         goto child_fail;
1447                                 }
1448
1449                                 if (initgroups((const char*)arg_user, gid) < 0) {
1450                                         log_error("initgroups() failed: %m");
1451                                         goto child_fail;
1452                                 }
1453
1454                                 if (setresgid(gid, gid, gid) < 0) {
1455                                         log_error("setregid() failed: %m");
1456                                         goto child_fail;
1457                                 }
1458
1459                                 if (setresuid(uid, uid, uid) < 0) {
1460                                         log_error("setreuid() failed: %m");
1461                                         goto child_fail;
1462                                 }
1463                         } else {
1464                                 /* Reset everything fully to 0, just in case */
1465
1466                                 if (setgroups(0, NULL) < 0) {
1467                                         log_error("setgroups() failed: %m");
1468                                         goto child_fail;
1469                                 }
1470
1471                                 if (setresgid(0, 0, 0) < 0) {
1472                                         log_error("setregid() failed: %m");
1473                                         goto child_fail;
1474                                 }
1475
1476                                 if (setresuid(0, 0, 0) < 0) {
1477                                         log_error("setreuid() failed: %m");
1478                                         goto child_fail;
1479                                 }
1480                         }
1481
1482                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1483                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1484                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1485                                 log_oom();
1486                                 goto child_fail;
1487                         }
1488
1489                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1490                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1491                                         log_oom();
1492                                         goto child_fail;
1493                                 }
1494                         }
1495
1496                         if (fdset_size(fds) > 0) {
1497                                 k = fdset_cloexec(fds, false);
1498                                 if (k < 0) {
1499                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1500                                         goto child_fail;
1501                                 }
1502
1503                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1504                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1505                                         log_oom();
1506                                         goto child_fail;
1507                                 }
1508                         }
1509
1510                         setup_hostname();
1511
1512                         eventfd_read(sync_fd, &x);
1513                         close_nointr_nofail(sync_fd);
1514                         sync_fd = -1;
1515
1516                         if (!strv_isempty(arg_setenv)) {
1517                                 char **n;
1518
1519                                 n = strv_env_merge(2, envp, arg_setenv);
1520                                 if (!n) {
1521                                         log_oom();
1522                                         goto child_fail;
1523                                 }
1524
1525                                 env_use = n;
1526                         } else
1527                                 env_use = (char**) envp;
1528
1529 #if HAVE_SELINUX
1530                         if (arg_process_label)
1531                                 if (setexeccon(arg_process_label) < 0)
1532                                         log_error("setexeccon(\"%s\") failed: %m", arg_process_label);
1533 #endif
1534                         if (arg_boot) {
1535                                 char **a;
1536                                 size_t l;
1537
1538                                 /* Automatically search for the init system */
1539
1540                                 l = 1 + argc - optind;
1541                                 a = newa(char*, l + 1);
1542                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1543
1544                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1545                                 execve(a[0], a, env_use);
1546
1547                                 a[0] = (char*) "/lib/systemd/systemd";
1548                                 execve(a[0], a, env_use);
1549
1550                                 a[0] = (char*) "/sbin/init";
1551                                 execve(a[0], a, env_use);
1552                         } else if (argc > optind)
1553                                 execvpe(argv[optind], argv + optind, env_use);
1554                         else {
1555                                 chdir(home ? home : "/root");
1556                                 execle("/bin/bash", "-bash", NULL, env_use);
1557                         }
1558
1559                         log_error("execv() failed: %m");
1560
1561                 child_fail:
1562                         _exit(EXIT_FAILURE);
1563                 }
1564
1565                 fdset_free(fds);
1566                 fds = NULL;
1567
1568                 r = register_machine(pid);
1569                 if (r < 0)
1570                         goto finish;
1571
1572                 eventfd_write(sync_fd, 1);
1573                 close_nointr_nofail(sync_fd);
1574                 sync_fd = -1;
1575
1576                 k = process_pty(master, &mask, arg_boot ? pid : 0, SIGRTMIN+3);
1577                 if (k < 0) {
1578                         r = EXIT_FAILURE;
1579                         break;
1580                 }
1581
1582                 putc('\n', stdout);
1583
1584                 /* Kill if it is not dead yet anyway */
1585                 terminate_machine(pid);
1586
1587                 /* Redundant, but better safe than sorry */
1588                 kill(pid, SIGKILL);
1589
1590                 k = wait_for_terminate(pid, &status);
1591                 pid = 0;
1592
1593                 if (k < 0) {
1594                         r = EXIT_FAILURE;
1595                         break;
1596                 }
1597
1598                 if (status.si_code == CLD_EXITED) {
1599                         r = status.si_status;
1600                         if (status.si_status != 0) {
1601                                 log_error("Container %s failed with error code %i.", arg_machine, status.si_status);
1602                                 break;
1603                         }
1604
1605                         log_debug("Container %s exited successfully.", arg_machine);
1606                         break;
1607                 } else if (status.si_code == CLD_KILLED &&
1608                            status.si_status == SIGINT) {
1609                         log_info("Container %s has been shut down.", arg_machine);
1610                         r = 0;
1611                         break;
1612                 } else if (status.si_code == CLD_KILLED &&
1613                            status.si_status == SIGHUP) {
1614                         log_info("Container %s is being rebooted.", arg_machine);
1615                         continue;
1616                 } else if (status.si_code == CLD_KILLED ||
1617                            status.si_code == CLD_DUMPED) {
1618
1619                         log_error("Container %s terminated by signal %s.", arg_machine,  signal_to_string(status.si_status));
1620                         r = EXIT_FAILURE;
1621                         break;
1622                 } else {
1623                         log_error("Container %s failed due to unknown reason.", arg_machine);
1624                         r = EXIT_FAILURE;
1625                         break;
1626                 }
1627         }
1628
1629 finish:
1630         if (pid > 0)
1631                 kill(pid, SIGKILL);
1632
1633         free(arg_directory);
1634         free(arg_machine);
1635         free(arg_setenv);
1636
1637         return r;
1638 }