chiark / gitweb /
nspawn: environment would be truncated with TERM unset
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44
45 #include <systemd/sd-daemon.h>
46
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "sd-id128.h"
58 #include "dev-setup.h"
59 #include "fdset.h"
60 #include "build.h"
61 #include "fileio.h"
62
/* Selects how setup_journal() connects the journal directory of the
 * container with the one of the host. */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* do not touch the journal directories at all */
        LINK_AUTO  = 1,  /* default: bind mount only if the host directory already exists */
        LINK_HOST  = 2,  /* create the host directory and bind mount it into the guest */
        LINK_GUEST = 3   /* make the host path a symlink to the guest directory */
} LinkJournal;
69
70 static char *arg_directory = NULL;
71 static char *arg_user = NULL;
72 static char **arg_controllers = NULL;
73 static char *arg_uuid = NULL;
74 static bool arg_private_network = false;
75 static bool arg_read_only = false;
76 static bool arg_boot = false;
77 static LinkJournal arg_link_journal = LINK_AUTO;
78 static uint64_t arg_retain =
79         (1ULL << CAP_CHOWN) |
80         (1ULL << CAP_DAC_OVERRIDE) |
81         (1ULL << CAP_DAC_READ_SEARCH) |
82         (1ULL << CAP_FOWNER) |
83         (1ULL << CAP_FSETID) |
84         (1ULL << CAP_IPC_OWNER) |
85         (1ULL << CAP_KILL) |
86         (1ULL << CAP_LEASE) |
87         (1ULL << CAP_LINUX_IMMUTABLE) |
88         (1ULL << CAP_NET_BIND_SERVICE) |
89         (1ULL << CAP_NET_BROADCAST) |
90         (1ULL << CAP_NET_RAW) |
91         (1ULL << CAP_SETGID) |
92         (1ULL << CAP_SETFCAP) |
93         (1ULL << CAP_SETPCAP) |
94         (1ULL << CAP_SETUID) |
95         (1ULL << CAP_SYS_ADMIN) |
96         (1ULL << CAP_SYS_CHROOT) |
97         (1ULL << CAP_SYS_NICE) |
98         (1ULL << CAP_SYS_PTRACE) |
99         (1ULL << CAP_SYS_TTY_CONFIG) |
100         (1ULL << CAP_SYS_RESOURCE) |
101         (1ULL << CAP_SYS_BOOT) |
102         (1ULL << CAP_AUDIT_WRITE) |
103         (1ULL << CAP_AUDIT_CONTROL);
104 static char **arg_bind = NULL;
105 static char **arg_bind_ro = NULL;
106
107 static int help(void) {
108
109         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
110                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
111                "  -h --help                Show this help\n"
112                "  --version                Print version string\n"
113                "  -D --directory=NAME      Root directory for the container\n"
114                "  -b --boot                Boot up full system (i.e. invoke init)\n"
115                "  -u --user=USER           Run the command under specified user or uid\n"
116                "  -C --controllers=LIST    Put the container in specified comma-separated\n"
117                "                           cgroup hierarchies\n"
118                "     --uuid=UUID           Set a specific machine UUID for the container\n"
119                "     --private-network     Disable network in container\n"
120                "     --read-only           Mount the root directory read-only\n"
121                "     --capability=CAP      In addition to the default, retain specified\n"
122                "                           capability\n"
123                "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
124                "  -j                       Equivalent to --link-journal=host\n"
125                "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
126                "                           the container\n"
127                "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
128                program_invocation_short_name);
129
130         return 0;
131 }
132
133 static int parse_argv(int argc, char *argv[]) {
134
135         enum {
136                 ARG_VERSION = 0x100,
137                 ARG_PRIVATE_NETWORK,
138                 ARG_UUID,
139                 ARG_READ_ONLY,
140                 ARG_CAPABILITY,
141                 ARG_LINK_JOURNAL,
142                 ARG_BIND,
143                 ARG_BIND_RO
144         };
145
146         static const struct option options[] = {
147                 { "help",            no_argument,       NULL, 'h'                 },
148                 { "version",         no_argument,       NULL, ARG_VERSION         },
149                 { "directory",       required_argument, NULL, 'D'                 },
150                 { "user",            required_argument, NULL, 'u'                 },
151                 { "controllers",     required_argument, NULL, 'C'                 },
152                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
153                 { "boot",            no_argument,       NULL, 'b'                 },
154                 { "uuid",            required_argument, NULL, ARG_UUID            },
155                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
156                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
157                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
158                 { "bind",            required_argument, NULL, ARG_BIND            },
159                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
160                 { NULL,              0,                 NULL, 0                   }
161         };
162
163         int c;
164
165         assert(argc >= 0);
166         assert(argv);
167
168         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
169
170                 switch (c) {
171
172                 case 'h':
173                         help();
174                         return 0;
175
176                 case ARG_VERSION:
177                         puts(PACKAGE_STRING);
178                         puts(SYSTEMD_FEATURES);
179                         return 0;
180
181                 case 'D':
182                         free(arg_directory);
183                         arg_directory = canonicalize_file_name(optarg);
184                         if (!arg_directory) {
185                                 log_error("Failed to canonicalize root directory.");
186                                 return -ENOMEM;
187                         }
188
189                         break;
190
191                 case 'u':
192                         free(arg_user);
193                         if (!(arg_user = strdup(optarg))) {
194                                 log_error("Failed to duplicate user name.");
195                                 return -ENOMEM;
196                         }
197
198                         break;
199
200                 case 'C':
201                         strv_free(arg_controllers);
202                         arg_controllers = strv_split(optarg, ",");
203                         if (!arg_controllers) {
204                                 log_error("Failed to split controllers list.");
205                                 return -ENOMEM;
206                         }
207                         strv_uniq(arg_controllers);
208
209                         break;
210
211                 case ARG_PRIVATE_NETWORK:
212                         arg_private_network = true;
213                         break;
214
215                 case 'b':
216                         arg_boot = true;
217                         break;
218
219                 case ARG_UUID:
220                         arg_uuid = optarg;
221                         break;
222
223                 case ARG_READ_ONLY:
224                         arg_read_only = true;
225                         break;
226
227                 case ARG_CAPABILITY: {
228                         char *state, *word;
229                         size_t length;
230
231                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
232                                 cap_value_t cap;
233                                 char *t;
234
235                                 t = strndup(word, length);
236                                 if (!t)
237                                         return log_oom();
238
239                                 if (cap_from_name(t, &cap) < 0) {
240                                         log_error("Failed to parse capability %s.", t);
241                                         free(t);
242                                         return -EINVAL;
243                                 }
244
245                                 free(t);
246                                 arg_retain |= 1ULL << (uint64_t) cap;
247                         }
248
249                         break;
250                 }
251
252                 case 'j':
253                         arg_link_journal = LINK_GUEST;
254                         break;
255
256                 case ARG_LINK_JOURNAL:
257                         if (streq(optarg, "auto"))
258                                 arg_link_journal = LINK_AUTO;
259                         else if (streq(optarg, "no"))
260                                 arg_link_journal = LINK_NO;
261                         else if (streq(optarg, "guest"))
262                                 arg_link_journal = LINK_GUEST;
263                         else if (streq(optarg, "host"))
264                                 arg_link_journal = LINK_HOST;
265                         else {
266                                 log_error("Failed to parse link journal mode %s", optarg);
267                                 return -EINVAL;
268                         }
269
270                         break;
271
272                 case ARG_BIND:
273                 case ARG_BIND_RO: {
274                         _cleanup_free_ char *a = NULL, *b = NULL;
275                         char *e;
276                         char ***x;
277                         int r;
278
279                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
280
281                         e = strchr(optarg, ':');
282                         if (e) {
283                                 a = strndup(optarg, e - optarg);
284                                 b = strdup(e + 1);
285                         } else {
286                                 a = strdup(optarg);
287                                 b = strdup(optarg);
288                         }
289
290                         if (!a || !b)
291                                 return log_oom();
292
293                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
294                                 log_error("Invalid bind mount specification: %s", optarg);
295                                 return -EINVAL;
296                         }
297
298                         r = strv_extend(x, a);
299                         if (r < 0)
300                                 return r;
301
302                         r = strv_extend(x, b);
303                         if (r < 0)
304                                 return r;
305
306                         break;
307                 }
308
309                 case '?':
310                         return -EINVAL;
311
312                 default:
313                         log_error("Unknown option code %c", c);
314                         return -EINVAL;
315                 }
316         }
317
318         return 1;
319 }
320
321 static int mount_all(const char *dest) {
322
323         typedef struct MountPoint {
324                 const char *what;
325                 const char *where;
326                 const char *type;
327                 const char *options;
328                 unsigned long flags;
329                 bool fatal;
330         } MountPoint;
331
332         static const MountPoint mount_table[] = {
333                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
334                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
335                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
336                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
337                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
338                 { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
339                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
340                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
341 #ifdef HAVE_SELINUX
342                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
343                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
344 #endif
345         };
346
347         unsigned k;
348         int r = 0;
349
350         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
351                 char _cleanup_free_ *where = NULL;
352                 int t;
353
354                 where = strjoin(dest, "/", mount_table[k].where, NULL);
355                 if (!where)
356                         return log_oom();
357
358                 t = path_is_mount_point(where, true);
359                 if (t < 0) {
360                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
361
362                         if (r == 0)
363                                 r = t;
364
365                         continue;
366                 }
367
368                 /* Skip this entry if it is not a remount. */
369                 if (mount_table[k].what && t > 0)
370                         continue;
371
372                 mkdir_p(where, 0755);
373
374                 if (mount(mount_table[k].what,
375                           where,
376                           mount_table[k].type,
377                           mount_table[k].flags,
378                           mount_table[k].options) < 0 &&
379                     mount_table[k].fatal) {
380
381                         log_error("mount(%s) failed: %m", where);
382
383                         if (r == 0)
384                                 r = -errno;
385                 }
386         }
387
388         return r;
389 }
390
391 static int mount_binds(const char *dest, char **l, unsigned long flags) {
392         char **x, **y;
393
394         STRV_FOREACH_PAIR(x, y, l) {
395                 _cleanup_free_ char *where = NULL;
396
397                 where = strjoin(dest, "/", *y, NULL);
398                 if (!where)
399                         return log_oom();
400
401                 mkdir_p_label(where, 0755);
402
403                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
404                         log_error("mount(%s) failed: %m", where);
405                         return -errno;
406                 }
407
408                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
409                         log_error("mount(%s) failed: %m", where);
410                         return -errno;
411                 }
412         }
413
414         return 0;
415 }
416
417 static int setup_timezone(const char *dest) {
418         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
419         char *z, *y;
420         int r;
421
422         assert(dest);
423
424         /* Fix the timezone, if possible */
425         r = readlink_malloc("/etc/localtime", &p);
426         if (r < 0) {
427                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
428                 return 0;
429         }
430
431         z = path_startswith(p, "../usr/share/zoneinfo/");
432         if (!z)
433                 z = path_startswith(p, "/usr/share/zoneinfo/");
434         if (!z) {
435                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
436                 return 0;
437         }
438
439         where = strappend(dest, "/etc/localtime");
440         if (!where)
441                 return log_oom();
442
443         r = readlink_malloc(where, &q);
444         if (r >= 0) {
445                 y = path_startswith(q, "../usr/share/zoneinfo/");
446                 if (!y)
447                         y = path_startswith(q, "/usr/share/zoneinfo/");
448
449
450                 /* Already pointing to the right place? Then do nothing .. */
451                 if (y && streq(y, z))
452                         return 0;
453         }
454
455         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
456         if (!check)
457                 return log_oom();
458
459         if (access(check, F_OK) < 0) {
460                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
461                 return 0;
462         }
463
464         what = strappend("../usr/share/zoneinfo/", z);
465         if (!what)
466                 return log_oom();
467
468         unlink(where);
469         if (symlink(what, where) < 0) {
470                 log_error("Failed to correct timezone of container: %m");
471                 return 0;
472         }
473
474         return 0;
475 }
476
477 static int setup_resolv_conf(const char *dest) {
478         char *where;
479
480         assert(dest);
481
482         if (arg_private_network)
483                 return 0;
484
485         /* Fix resolv.conf, if possible */
486         where = strappend(dest, "/etc/resolv.conf");
487         if (!where)
488                 return log_oom();
489
490         /* We don't really care for the results of this really. If it
491          * fails, it fails, but meh... */
492         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
493                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
494
495         free(where);
496
497         return 0;
498 }
499
500 static int setup_boot_id(const char *dest) {
501         char _cleanup_free_ *from = NULL, *to = NULL;
502         sd_id128_t rnd;
503         char as_uuid[37];
504         int r;
505
506         assert(dest);
507
508         /* Generate a new randomized boot ID, so that each boot-up of
509          * the container gets a new one */
510
511         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
512         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
513         if (!from || !to)
514                 return log_oom();
515
516         r = sd_id128_randomize(&rnd);
517         if (r < 0) {
518                 log_error("Failed to generate random boot id: %s", strerror(-r));
519                 return r;
520         }
521
522         snprintf(as_uuid, sizeof(as_uuid),
523                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
524                  SD_ID128_FORMAT_VAL(rnd));
525         char_array_0(as_uuid);
526
527         r = write_one_line_file(from, as_uuid);
528         if (r < 0) {
529                 log_error("Failed to write boot id: %s", strerror(-r));
530                 return r;
531         }
532
533         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
534                 log_error("Failed to bind mount boot id: %m");
535                 r = -errno;
536         } else
537                 mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
538
539         unlink(from);
540         return r;
541 }
542
543 static int copy_devnodes(const char *dest) {
544
545         static const char devnodes[] =
546                 "null\0"
547                 "zero\0"
548                 "full\0"
549                 "random\0"
550                 "urandom\0"
551                 "tty\0"
552                 "ptmx\0";
553
554         const char *d;
555         int r = 0;
556         mode_t _cleanup_umask_ u;
557
558         assert(dest);
559
560         u = umask(0000);
561
562         NULSTR_FOREACH(d, devnodes) {
563                 struct stat st;
564                 char _cleanup_free_ *from = NULL, *to = NULL;
565
566                 asprintf(&from, "/dev/%s", d);
567                 asprintf(&to, "%s/dev/%s", dest, d);
568
569                 if (!from || !to) {
570                         log_oom();
571
572                         if (r == 0)
573                                 r = -ENOMEM;
574
575                         break;
576                 }
577
578                 if (stat(from, &st) < 0) {
579
580                         if (errno != ENOENT) {
581                                 log_error("Failed to stat %s: %m", from);
582                                 if (r == 0)
583                                         r = -errno;
584                         }
585
586                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
587
588                         log_error("%s is not a char or block device, cannot copy", from);
589                         if (r == 0)
590                                 r = -EIO;
591
592                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
593
594                         log_error("mknod(%s) failed: %m", dest);
595                         if (r == 0)
596                                 r = -errno;
597                 }
598         }
599
600         return r;
601 }
602
603 static int setup_dev_console(const char *dest, const char *console) {
604         struct stat st;
605         char _cleanup_free_ *to = NULL;
606         int r;
607         mode_t _cleanup_umask_ u;
608
609         assert(dest);
610         assert(console);
611
612         u = umask(0000);
613
614         if (stat(console, &st) < 0) {
615                 log_error("Failed to stat %s: %m", console);
616                 return -errno;
617
618         } else if (!S_ISCHR(st.st_mode)) {
619                 log_error("/dev/console is not a char device");
620                 return -EIO;
621         }
622
623         r = chmod_and_chown(console, 0600, 0, 0);
624         if (r < 0) {
625                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
626                 return r;
627         }
628
629         if (asprintf(&to, "%s/dev/console", dest) < 0)
630                 return log_oom();
631
632         /* We need to bind mount the right tty to /dev/console since
633          * ptys can only exist on pts file systems. To have something
634          * to bind mount things on we create a device node first, that
635          * has the right major/minor (note that the major minor
636          * doesn't actually matter here, since we mount it over
637          * anyway). */
638
639         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
640                 log_error("mknod() for /dev/console failed: %m");
641                 return -errno;
642         }
643
644         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
645                 log_error("Bind mount for /dev/console failed: %m");
646                 return -errno;
647         }
648
649         return 0;
650 }
651
652 static int setup_kmsg(const char *dest, int kmsg_socket) {
653         char _cleanup_free_ *from = NULL, *to = NULL;
654         int r, fd, k;
655         mode_t _cleanup_umask_ u;
656         union {
657                 struct cmsghdr cmsghdr;
658                 uint8_t buf[CMSG_SPACE(sizeof(int))];
659         } control;
660         struct msghdr mh;
661         struct cmsghdr *cmsg;
662
663         assert(dest);
664         assert(kmsg_socket >= 0);
665
666         u = umask(0000);
667
668         /* We create the kmsg FIFO as /dev/kmsg, but immediately
669          * delete it after bind mounting it to /proc/kmsg. While FIFOs
670          * on the reading side behave very similar to /proc/kmsg,
671          * their writing side behaves differently from /dev/kmsg in
672          * that writing blocks when nothing is reading. In order to
673          * avoid any problems with containers deadlocking due to this
674          * we simply make /dev/kmsg unavailable to the container. */
675         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
676             asprintf(&to, "%s/proc/kmsg", dest) < 0)
677                 return log_oom();
678
679         if (mkfifo(from, 0600) < 0) {
680                 log_error("mkfifo() for /dev/kmsg failed: %m");
681                 return -errno;
682         }
683
684         r = chmod_and_chown(from, 0600, 0, 0);
685         if (r < 0) {
686                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
687                 return r;
688         }
689
690         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
691                 log_error("Bind mount for /proc/kmsg failed: %m");
692                 return -errno;
693         }
694
695         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
696         if (fd < 0) {
697                 log_error("Failed to open fifo: %m");
698                 return -errno;
699         }
700
701         zero(mh);
702         zero(control);
703
704         mh.msg_control = &control;
705         mh.msg_controllen = sizeof(control);
706
707         cmsg = CMSG_FIRSTHDR(&mh);
708         cmsg->cmsg_level = SOL_SOCKET;
709         cmsg->cmsg_type = SCM_RIGHTS;
710         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
711         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
712
713         mh.msg_controllen = cmsg->cmsg_len;
714
715         /* Store away the fd in the socket, so that it stays open as
716          * long as we run the child */
717         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
718         close_nointr_nofail(fd);
719
720         if (k < 0) {
721                 log_error("Failed to send FIFO fd: %m");
722                 return -errno;
723         }
724
725         /* And now make the FIFO unavailable as /dev/kmsg... */
726         unlink(from);
727         return 0;
728 }
729
730 static int setup_hostname(void) {
731         char *hn;
732         int r = 0;
733
734         hn = path_get_file_name(arg_directory);
735         if (hn) {
736                 hn = strdup(hn);
737                 if (!hn)
738                         return -ENOMEM;
739
740                 hostname_cleanup(hn);
741
742                 if (!isempty(hn))
743                         if (sethostname(hn, strlen(hn)) < 0)
744                                 r = -errno;
745
746                 free(hn);
747         }
748
749         return r;
750 }
751
752 static int setup_journal(const char *directory) {
753         sd_id128_t machine_id;
754         char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
755         char *id;
756         int r;
757
758         if (arg_link_journal == LINK_NO)
759                 return 0;
760
761         p = strappend(directory, "/etc/machine-id");
762         if (!p)
763                 return log_oom();
764
765         r = read_one_line_file(p, &b);
766         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
767                 return 0;
768         else if (r < 0) {
769                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
770                 return r;
771         }
772
773         id = strstrip(b);
774         if (isempty(id) && arg_link_journal == LINK_AUTO)
775                 return 0;
776
777         /* Verify validity */
778         r = sd_id128_from_string(id, &machine_id);
779         if (r < 0) {
780                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
781                 return r;
782         }
783
784         free(p);
785         p = strappend("/var/log/journal/", id);
786         q = strjoin(directory, "/var/log/journal/", id, NULL);
787         if (!p || !q)
788                 return log_oom();
789
790         if (path_is_mount_point(p, false) > 0) {
791                 if (arg_link_journal != LINK_AUTO) {
792                         log_error("%s: already a mount point, refusing to use for journal", p);
793                         return -EEXIST;
794                 }
795
796                 return 0;
797         }
798
799         if (path_is_mount_point(q, false) > 0) {
800                 if (arg_link_journal != LINK_AUTO) {
801                         log_error("%s: already a mount point, refusing to use for journal", q);
802                         return -EEXIST;
803                 }
804
805                 return 0;
806         }
807
808         r = readlink_and_make_absolute(p, &d);
809         if (r >= 0) {
810                 if ((arg_link_journal == LINK_GUEST ||
811                      arg_link_journal == LINK_AUTO) &&
812                     path_equal(d, q)) {
813
814                         r = mkdir_p(q, 0755);
815                         if (r < 0)
816                                 log_warning("failed to create directory %s: %m", q);
817                         return 0;
818                 }
819
820                 if (unlink(p) < 0) {
821                         log_error("Failed to remove symlink %s: %m", p);
822                         return -errno;
823                 }
824         } else if (r == -EINVAL) {
825
826                 if (arg_link_journal == LINK_GUEST &&
827                     rmdir(p) < 0) {
828
829                         if (errno == ENOTDIR) {
830                                 log_error("%s already exists and is neither a symlink nor a directory", p);
831                                 return r;
832                         } else {
833                                 log_error("Failed to remove %s: %m", p);
834                                 return -errno;
835                         }
836                 }
837         } else if (r != -ENOENT) {
838                 log_error("readlink(%s) failed: %m", p);
839                 return r;
840         }
841
842         if (arg_link_journal == LINK_GUEST) {
843
844                 if (symlink(q, p) < 0) {
845                         log_error("Failed to symlink %s to %s: %m", q, p);
846                         return -errno;
847                 }
848
849                 r = mkdir_p(q, 0755);
850                 if (r < 0)
851                         log_warning("failed to create directory %s: %m", q);
852                 return 0;
853         }
854
855         if (arg_link_journal == LINK_HOST) {
856                 r = mkdir_p(p, 0755);
857                 if (r < 0) {
858                         log_error("Failed to create %s: %m", p);
859                         return r;
860                 }
861
862         } else if (access(p, F_OK) < 0)
863                 return 0;
864
865         if (dir_is_empty(q) == 0) {
866                 log_error("%s not empty.", q);
867                 return -ENOTEMPTY;
868         }
869
870         r = mkdir_p(q, 0755);
871         if (r < 0) {
872                 log_error("Failed to create %s: %m", q);
873                 return r;
874         }
875
876         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
877                 log_error("Failed to bind mount journal from host into guest: %m");
878                 return -errno;
879         }
880
881         return 0;
882 }
883
884 static int drop_capabilities(void) {
885         return capability_bounding_set_drop(~arg_retain, false);
886 }
887
/* Check whether the directory 'path' looks like the root of an OS tree.
 *
 * <path>/bin/sh serves as the flag file: the result is 1 if access()
 * reports that it exists, 0 if it does not, and -ENOMEM if the path
 * string could not be put together. */
static int is_os_tree(const char *path) {
        char *shell_path;
        int len, found;

        /* First pass only measures how much room the joined path needs */
        len = snprintf(NULL, 0, "%s/bin/sh", path);
        if (len < 0)
                return -ENOMEM;

        shell_path = malloc((size_t) len + 1);
        if (!shell_path)
                return -ENOMEM;

        snprintf(shell_path, (size_t) len + 1, "%s/bin/sh", path);

        found = access(shell_path, F_OK) >= 0;
        free(shell_path);

        return found;
}
901
902 static int process_pty(int master, pid_t pid, sigset_t *mask) {
903
904         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
905         size_t in_buffer_full = 0, out_buffer_full = 0;
906         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
907         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
908         int ep = -1, signal_fd = -1, r;
909         bool tried_orderly_shutdown = false;
910
911         assert(master >= 0);
912         assert(pid > 0);
913         assert(mask);
914
915         fd_nonblock(STDIN_FILENO, 1);
916         fd_nonblock(STDOUT_FILENO, 1);
917         fd_nonblock(master, 1);
918
919         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
920         if (signal_fd < 0) {
921                 log_error("signalfd(): %m");
922                 r = -errno;
923                 goto finish;
924         }
925
926         ep = epoll_create1(EPOLL_CLOEXEC);
927         if (ep < 0) {
928                 log_error("Failed to create epoll: %m");
929                 r = -errno;
930                 goto finish;
931         }
932
933         /* We read from STDIN only if this is actually a TTY,
934          * otherwise we assume non-interactivity. */
935         if (isatty(STDIN_FILENO)) {
936                 zero(stdin_ev);
937                 stdin_ev.events = EPOLLIN|EPOLLET;
938                 stdin_ev.data.fd = STDIN_FILENO;
939
940                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
941                         log_error("Failed to register STDIN in epoll: %m");
942                         r = -errno;
943                         goto finish;
944                 }
945         }
946
947         zero(stdout_ev);
948         stdout_ev.events = EPOLLOUT|EPOLLET;
949         stdout_ev.data.fd = STDOUT_FILENO;
950
951         zero(master_ev);
952         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
953         master_ev.data.fd = master;
954
955         zero(signal_ev);
956         signal_ev.events = EPOLLIN;
957         signal_ev.data.fd = signal_fd;
958
959         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
960                 if (errno != EPERM) {
961                         log_error("Failed to register stdout in epoll: %m");
962                         r = -errno;
963                         goto finish;
964                 }
965                 /* stdout without epoll support. Likely redirected to regular file. */
966                 stdout_writable = true;
967         }
968
969         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
970             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
971                 log_error("Failed to register fds in epoll: %m");
972                 r = -errno;
973                 goto finish;
974         }
975
976         for (;;) {
977                 struct epoll_event ev[16];
978                 ssize_t k;
979                 int i, nfds;
980
981                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
982                 if (nfds < 0) {
983
984                         if (errno == EINTR || errno == EAGAIN)
985                                 continue;
986
987                         log_error("epoll_wait(): %m");
988                         r = -errno;
989                         goto finish;
990                 }
991
992                 assert(nfds >= 1);
993
994                 for (i = 0; i < nfds; i++) {
995                         if (ev[i].data.fd == STDIN_FILENO) {
996
997                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
998                                         stdin_readable = true;
999
1000                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1001
1002                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1003                                         stdout_writable = true;
1004
1005                         } else if (ev[i].data.fd == master) {
1006
1007                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1008                                         master_readable = true;
1009
1010                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1011                                         master_writable = true;
1012
1013                         } else if (ev[i].data.fd == signal_fd) {
1014                                 struct signalfd_siginfo sfsi;
1015                                 ssize_t n;
1016
1017                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1018                                 if (n != sizeof(sfsi)) {
1019
1020                                         if (n >= 0) {
1021                                                 log_error("Failed to read from signalfd: invalid block size");
1022                                                 r = -EIO;
1023                                                 goto finish;
1024                                         }
1025
1026                                         if (errno != EINTR && errno != EAGAIN) {
1027                                                 log_error("Failed to read from signalfd: %m");
1028                                                 r = -errno;
1029                                                 goto finish;
1030                                         }
1031                                 } else {
1032
1033                                         if (sfsi.ssi_signo == SIGWINCH) {
1034                                                 struct winsize ws;
1035
1036                                                 /* The window size changed, let's forward that. */
1037                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1038                                                         ioctl(master, TIOCSWINSZ, &ws);
1039                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1040
1041                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1042
1043                                                 /* This only works for systemd... */
1044                                                 tried_orderly_shutdown = true;
1045                                                 kill(pid, SIGRTMIN+3);
1046
1047                                         } else {
1048                                                 r = 0;
1049                                                 goto finish;
1050                                         }
1051                                 }
1052                         }
1053                 }
1054
1055                 while ((stdin_readable && in_buffer_full <= 0) ||
1056                        (master_writable && in_buffer_full > 0) ||
1057                        (master_readable && out_buffer_full <= 0) ||
1058                        (stdout_writable && out_buffer_full > 0)) {
1059
1060                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1061
1062                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1063                                 if (k < 0) {
1064
1065                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1066                                                 stdin_readable = false;
1067                                         else {
1068                                                 log_error("read(): %m");
1069                                                 r = -errno;
1070                                                 goto finish;
1071                                         }
1072                                 } else
1073                                         in_buffer_full += (size_t) k;
1074                         }
1075
1076                         if (master_writable && in_buffer_full > 0) {
1077
1078                                 k = write(master, in_buffer, in_buffer_full);
1079                                 if (k < 0) {
1080
1081                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1082                                                 master_writable = false;
1083                                         else {
1084                                                 log_error("write(): %m");
1085                                                 r = -errno;
1086                                                 goto finish;
1087                                         }
1088
1089                                 } else {
1090                                         assert(in_buffer_full >= (size_t) k);
1091                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1092                                         in_buffer_full -= k;
1093                                 }
1094                         }
1095
1096                         if (master_readable && out_buffer_full < LINE_MAX) {
1097
1098                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1099                                 if (k < 0) {
1100
1101                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1102                                                 master_readable = false;
1103                                         else {
1104                                                 log_error("read(): %m");
1105                                                 r = -errno;
1106                                                 goto finish;
1107                                         }
1108                                 }  else
1109                                         out_buffer_full += (size_t) k;
1110                         }
1111
1112                         if (stdout_writable && out_buffer_full > 0) {
1113
1114                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1115                                 if (k < 0) {
1116
1117                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1118                                                 stdout_writable = false;
1119                                         else {
1120                                                 log_error("write(): %m");
1121                                                 r = -errno;
1122                                                 goto finish;
1123                                         }
1124
1125                                 } else {
1126                                         assert(out_buffer_full >= (size_t) k);
1127                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1128                                         out_buffer_full -= k;
1129                                 }
1130                         }
1131                 }
1132         }
1133
1134 finish:
1135         if (ep >= 0)
1136                 close_nointr_nofail(ep);
1137
1138         if (signal_fd >= 0)
1139                 close_nointr_nofail(signal_fd);
1140
1141         return r;
1142 }
1143
1144 int main(int argc, char *argv[]) {
1145         pid_t pid = 0;
1146         int r = EXIT_FAILURE, k;
1147         char *oldcg = NULL, *newcg = NULL;
1148         char **controller = NULL;
1149         int master = -1, n_fd_passed;
1150         const char *console = NULL;
1151         struct termios saved_attr, raw_attr;
1152         sigset_t mask;
1153         bool saved_attr_valid = false;
1154         struct winsize ws;
1155         int kmsg_socket_pair[2] = { -1, -1 };
1156         FDSet *fds = NULL;
1157
1158         log_parse_environment();
1159         log_open();
1160
1161         r = parse_argv(argc, argv);
1162         if (r <= 0)
1163                 goto finish;
1164
1165         if (arg_directory) {
1166                 char *p;
1167
1168                 p = path_make_absolute_cwd(arg_directory);
1169                 free(arg_directory);
1170                 arg_directory = p;
1171         } else
1172                 arg_directory = get_current_dir_name();
1173
1174         if (!arg_directory) {
1175                 log_error("Failed to determine path");
1176                 goto finish;
1177         }
1178
1179         path_kill_slashes(arg_directory);
1180
1181         if (geteuid() != 0) {
1182                 log_error("Need to be root.");
1183                 goto finish;
1184         }
1185
1186         if (sd_booted() <= 0) {
1187                 log_error("Not running on a systemd system.");
1188                 goto finish;
1189         }
1190
1191         if (path_equal(arg_directory, "/")) {
1192                 log_error("Spawning container on root directory not supported.");
1193                 goto finish;
1194         }
1195
1196         if (is_os_tree(arg_directory) <= 0) {
1197                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1198                 goto finish;
1199         }
1200
1201         log_close();
1202         n_fd_passed = sd_listen_fds(false);
1203         if (n_fd_passed > 0) {
1204                 k = fdset_new_listen_fds(&fds, false);
1205                 if (k < 0) {
1206                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1207                         goto finish;
1208                 }
1209         }
1210         fdset_close_others(fds);
1211         log_open();
1212
1213         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1214         if (k < 0) {
1215                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1216                 goto finish;
1217         }
1218
1219         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1220                 log_error("Failed to allocate cgroup path.");
1221                 goto finish;
1222         }
1223
1224         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1225         if (k < 0)  {
1226                 log_error("Failed to create cgroup: %s", strerror(-k));
1227                 goto finish;
1228         }
1229
1230         STRV_FOREACH(controller, arg_controllers) {
1231                 k = cg_create_and_attach(*controller, newcg, 0);
1232                 if (k < 0)
1233                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1234         }
1235
1236         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1237         if (master < 0) {
1238                 log_error("Failed to acquire pseudo tty: %m");
1239                 goto finish;
1240         }
1241
1242         console = ptsname(master);
1243         if (!console) {
1244                 log_error("Failed to determine tty name: %m");
1245                 goto finish;
1246         }
1247
1248         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1249
1250         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1251                 ioctl(master, TIOCSWINSZ, &ws);
1252
1253         if (unlockpt(master) < 0) {
1254                 log_error("Failed to unlock tty: %m");
1255                 goto finish;
1256         }
1257
1258         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1259                 saved_attr_valid = true;
1260
1261                 raw_attr = saved_attr;
1262                 cfmakeraw(&raw_attr);
1263                 raw_attr.c_lflag &= ~ECHO;
1264         }
1265
1266         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1267                 log_error("Failed to create kmsg socket pair");
1268                 goto finish;
1269         }
1270
1271         assert_se(sigemptyset(&mask) == 0);
1272         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1273         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1274
1275         for (;;) {
1276                 siginfo_t status;
1277                 int pipefd[2];
1278
1279                 if(pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1280                         log_error("pipe2(): %m");
1281                         goto finish;
1282                 }
1283
1284                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1285                 if (pid < 0) {
1286                         if (errno == EINVAL)
1287                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1288                         else
1289                                 log_error("clone() failed: %m");
1290
1291                         goto finish;
1292                 }
1293
1294                 if (pid == 0) {
1295                         /* child */
1296                         const char *home = NULL;
1297                         uid_t uid = (uid_t) -1;
1298                         gid_t gid = (gid_t) -1;
1299                         unsigned n_env = 2;
1300                         const char *envp[] = {
1301                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1302                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1303                                 NULL, /* TERM */
1304                                 NULL, /* HOME */
1305                                 NULL, /* USER */
1306                                 NULL, /* LOGNAME */
1307                                 NULL, /* container_uuid */
1308                                 NULL, /* LISTEN_FDS */
1309                                 NULL, /* LISTEN_PID */
1310                                 NULL
1311                         };
1312
1313                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1314                         if (envp[n_env])
1315                                 n_env ++;
1316
1317                         close_nointr_nofail(pipefd[1]);
1318                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1319                         close_nointr_nofail(pipefd[0]);
1320
1321                         close_nointr_nofail(master);
1322                         master = -1;
1323
1324                         if (saved_attr_valid) {
1325                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1326                                         log_error("Failed to set terminal attributes: %m");
1327                                         goto child_fail;
1328                                 }
1329                         }
1330
1331                         close_nointr(STDIN_FILENO);
1332                         close_nointr(STDOUT_FILENO);
1333                         close_nointr(STDERR_FILENO);
1334
1335                         close_nointr_nofail(kmsg_socket_pair[0]);
1336                         kmsg_socket_pair[0] = -1;
1337
1338                         reset_all_signal_handlers();
1339
1340                         assert_se(sigemptyset(&mask) == 0);
1341                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1342
1343                         k = open_terminal(console, O_RDWR);
1344                         if (k != STDIN_FILENO) {
1345                                 if (k >= 0) {
1346                                         close_nointr_nofail(k);
1347                                         k = -EINVAL;
1348                                 }
1349
1350                                 log_error("Failed to open console: %s", strerror(-k));
1351                                 goto child_fail;
1352                         }
1353
1354                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1355                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1356                                 log_error("Failed to duplicate console: %m");
1357                                 goto child_fail;
1358                         }
1359
1360                         if (setsid() < 0) {
1361                                 log_error("setsid() failed: %m");
1362                                 goto child_fail;
1363                         }
1364
1365                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1366                                 log_error("PR_SET_PDEATHSIG failed: %m");
1367                                 goto child_fail;
1368                         }
1369
1370                         /* Mark everything as slave, so that we still
1371                          * receive mounts from the real root, but don't
1372                          * propagate mounts to the real root. */
1373                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1374                                 log_error("MS_SLAVE|MS_REC failed: %m");
1375                                 goto child_fail;
1376                         }
1377
1378                         /* Turn directory into bind mount */
1379                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1380                                 log_error("Failed to make bind mount.");
1381                                 goto child_fail;
1382                         }
1383
1384                         if (arg_read_only)
1385                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1386                                         log_error("Failed to make read-only.");
1387                                         goto child_fail;
1388                                 }
1389
1390                         if (mount_all(arg_directory) < 0)
1391                                 goto child_fail;
1392
1393                         if (copy_devnodes(arg_directory) < 0)
1394                                 goto child_fail;
1395
1396                         dev_setup(arg_directory);
1397
1398                         if (setup_dev_console(arg_directory, console) < 0)
1399                                 goto child_fail;
1400
1401                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1402                                 goto child_fail;
1403
1404                         close_nointr_nofail(kmsg_socket_pair[1]);
1405                         kmsg_socket_pair[1] = -1;
1406
1407                         if (setup_boot_id(arg_directory) < 0)
1408                                 goto child_fail;
1409
1410                         if (setup_timezone(arg_directory) < 0)
1411                                 goto child_fail;
1412
1413                         if (setup_resolv_conf(arg_directory) < 0)
1414                                 goto child_fail;
1415
1416                         if (setup_journal(arg_directory) < 0)
1417                                 goto child_fail;
1418
1419                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1420                                 goto child_fail;
1421
1422                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1423                                 goto child_fail;
1424
1425                         if (chdir(arg_directory) < 0) {
1426                                 log_error("chdir(%s) failed: %m", arg_directory);
1427                                 goto child_fail;
1428                         }
1429
1430                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1431                                 log_error("mount(MS_MOVE) failed: %m");
1432                                 goto child_fail;
1433                         }
1434
1435                         if (chroot(".") < 0) {
1436                                 log_error("chroot() failed: %m");
1437                                 goto child_fail;
1438                         }
1439
1440                         if (chdir("/") < 0) {
1441                                 log_error("chdir() failed: %m");
1442                                 goto child_fail;
1443                         }
1444
1445                         umask(0022);
1446
1447                         loopback_setup();
1448
1449                         if (drop_capabilities() < 0) {
1450                                 log_error("drop_capabilities() failed: %m");
1451                                 goto child_fail;
1452                         }
1453
1454                         if (arg_user) {
1455
1456                                 /* Note that this resolves user names
1457                                  * inside the container, and hence
1458                                  * accesses the NSS modules from the
1459                                  * container and not the host. This is
1460                                  * a bit weird... */
1461
1462                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1463                                         log_error("get_user_creds() failed: %m");
1464                                         goto child_fail;
1465                                 }
1466
1467                                 if (mkdir_parents_label(home, 0775) < 0) {
1468                                         log_error("mkdir_parents_label() failed: %m");
1469                                         goto child_fail;
1470                                 }
1471
1472                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1473                                         log_error("mkdir_safe_label() failed: %m");
1474                                         goto child_fail;
1475                                 }
1476
1477                                 if (initgroups((const char*)arg_user, gid) < 0) {
1478                                         log_error("initgroups() failed: %m");
1479                                         goto child_fail;
1480                                 }
1481
1482                                 if (setresgid(gid, gid, gid) < 0) {
1483                                         log_error("setregid() failed: %m");
1484                                         goto child_fail;
1485                                 }
1486
1487                                 if (setresuid(uid, uid, uid) < 0) {
1488                                         log_error("setreuid() failed: %m");
1489                                         goto child_fail;
1490                                 }
1491                         } else {
1492                                 /* Reset everything fully to 0, just in case */
1493
1494                                 if (setgroups(0, NULL) < 0) {
1495                                         log_error("setgroups() failed: %m");
1496                                         goto child_fail;
1497                                 }
1498
1499                                 if (setresgid(0, 0, 0) < 0) {
1500                                         log_error("setregid() failed: %m");
1501                                         goto child_fail;
1502                                 }
1503
1504                                 if (setresuid(0, 0, 0) < 0) {
1505                                         log_error("setreuid() failed: %m");
1506                                         goto child_fail;
1507                                 }
1508                         }
1509
1510                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1511                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1512                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1513                                 log_oom();
1514                                 goto child_fail;
1515                         }
1516
1517                         if (arg_uuid) {
1518                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1519                                         log_oom();
1520                                         goto child_fail;
1521                                 }
1522                         }
1523
1524                         if (fdset_size(fds) > 0) {
1525                                 k = fdset_cloexec(fds, false);
1526                                 if (k < 0) {
1527                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1528                                         goto child_fail;
1529                                 }
1530
1531                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1532                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1533                                         log_oom();
1534                                         goto child_fail;
1535                                 }
1536                         }
1537
1538                         setup_hostname();
1539
1540                         if (arg_boot) {
1541                                 char **a;
1542                                 size_t l;
1543
1544                                 /* Automatically search for the init system */
1545
1546                                 l = 1 + argc - optind;
1547                                 a = newa(char*, l + 1);
1548                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1549
1550                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1551                                 execve(a[0], a, (char**) envp);
1552
1553                                 a[0] = (char*) "/lib/systemd/systemd";
1554                                 execve(a[0], a, (char**) envp);
1555
1556                                 a[0] = (char*) "/sbin/init";
1557                                 execve(a[0], a, (char**) envp);
1558                         } else if (argc > optind)
1559                                 execvpe(argv[optind], argv + optind, (char**) envp);
1560                         else {
1561                                 chdir(home ? home : "/root");
1562                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1563                         }
1564
1565                         log_error("execv() failed: %m");
1566
1567                 child_fail:
1568                         _exit(EXIT_FAILURE);
1569                 }
1570
1571                 log_info("Init process in the container running as PID %d", pid);
1572                 close_nointr_nofail(pipefd[0]);
1573                 close_nointr_nofail(pipefd[1]);
1574
1575                 fdset_free(fds);
1576                 fds = NULL;
1577
1578                 if (process_pty(master, pid, &mask) < 0)
1579                         goto finish;
1580
1581                 if (saved_attr_valid)
1582                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1583
1584                 r = wait_for_terminate(pid, &status);
1585                 if (r < 0) {
1586                         r = EXIT_FAILURE;
1587                         break;
1588                 }
1589
1590                 if (status.si_code == CLD_EXITED) {
1591                         if (status.si_status != 0) {
1592                                 log_error("Container failed with error code %i.", status.si_status);
1593                                 r = status.si_status;
1594                                 break;
1595                         }
1596
1597                         log_debug("Container exited successfully.");
1598                         break;
1599                 } else if (status.si_code == CLD_KILLED &&
1600                            status.si_status == SIGINT) {
1601                         log_info("Container has been shut down.");
1602                         r = 0;
1603                         break;
1604                 } else if (status.si_code == CLD_KILLED &&
1605                            status.si_status == SIGHUP) {
1606                         log_info("Container is being rebooted.");
1607                         continue;
1608                 } else if (status.si_code == CLD_KILLED ||
1609                            status.si_code == CLD_DUMPED) {
1610
1611                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1612                         r = EXIT_FAILURE;
1613                         break;
1614                 } else {
1615                         log_error("Container failed due to unknown reason.");
1616                         r = EXIT_FAILURE;
1617                         break;
1618                 }
1619         }
1620
1621 finish:
1622         if (saved_attr_valid)
1623                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1624
1625         if (master >= 0)
1626                 close_nointr_nofail(master);
1627
1628         close_pipe(kmsg_socket_pair);
1629
1630         if (oldcg)
1631                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1632
1633         if (newcg)
1634                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1635
1636         free(arg_directory);
1637         strv_free(arg_controllers);
1638         free(oldcg);
1639         free(newcg);
1640
1641         fdset_free(fds);
1642
1643         return r;
1644 }