chiark / gitweb /
util: rename write_one_line_file() to write_string_file()
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44
45 #include <systemd/sd-daemon.h>
46
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "sd-id128.h"
58 #include "dev-setup.h"
59 #include "fdset.h"
60 #include "build.h"
61 #include "fileio.h"
62
/* Numeric group id handed to devpts through its "gid=" mount option (see the
 * mount table in mount_all()). Only defined here when the build did not
 * already provide a value. */
#if !defined(TTY_GID)
#  define TTY_GID 5
#endif
66
/* Journal linking mode, chosen with --link-journal= (or -j) and consumed by
 * setup_journal(). */
typedef enum LinkJournal {
        LINK_NO,        /* setup_journal() does nothing */
        LINK_AUTO,      /* default; missing prerequisites are skipped silently */
        LINK_HOST,      /* host journal directory is bind mounted into the guest */
        LINK_GUEST,     /* host journal path becomes a symlink into the guest tree */
} LinkJournal;
73
74 static char *arg_directory = NULL;
75 static char *arg_user = NULL;
76 static char **arg_controllers = NULL;
77 static char *arg_uuid = NULL;
78 static bool arg_private_network = false;
79 static bool arg_read_only = false;
80 static bool arg_boot = false;
81 static LinkJournal arg_link_journal = LINK_AUTO;
82 static uint64_t arg_retain =
83         (1ULL << CAP_CHOWN) |
84         (1ULL << CAP_DAC_OVERRIDE) |
85         (1ULL << CAP_DAC_READ_SEARCH) |
86         (1ULL << CAP_FOWNER) |
87         (1ULL << CAP_FSETID) |
88         (1ULL << CAP_IPC_OWNER) |
89         (1ULL << CAP_KILL) |
90         (1ULL << CAP_LEASE) |
91         (1ULL << CAP_LINUX_IMMUTABLE) |
92         (1ULL << CAP_NET_BIND_SERVICE) |
93         (1ULL << CAP_NET_BROADCAST) |
94         (1ULL << CAP_NET_RAW) |
95         (1ULL << CAP_SETGID) |
96         (1ULL << CAP_SETFCAP) |
97         (1ULL << CAP_SETPCAP) |
98         (1ULL << CAP_SETUID) |
99         (1ULL << CAP_SYS_ADMIN) |
100         (1ULL << CAP_SYS_CHROOT) |
101         (1ULL << CAP_SYS_NICE) |
102         (1ULL << CAP_SYS_PTRACE) |
103         (1ULL << CAP_SYS_TTY_CONFIG) |
104         (1ULL << CAP_SYS_RESOURCE) |
105         (1ULL << CAP_SYS_BOOT) |
106         (1ULL << CAP_AUDIT_WRITE) |
107         (1ULL << CAP_AUDIT_CONTROL);
108 static char **arg_bind = NULL;
109 static char **arg_bind_ro = NULL;
110
111 static int help(void) {
112
113         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
114                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
115                "  -h --help                Show this help\n"
116                "  --version                Print version string\n"
117                "  -D --directory=NAME      Root directory for the container\n"
118                "  -b --boot                Boot up full system (i.e. invoke init)\n"
119                "  -u --user=USER           Run the command under specified user or uid\n"
120                "  -C --controllers=LIST    Put the container in specified comma-separated\n"
121                "                           cgroup hierarchies\n"
122                "     --uuid=UUID           Set a specific machine UUID for the container\n"
123                "     --private-network     Disable network in container\n"
124                "     --read-only           Mount the root directory read-only\n"
125                "     --capability=CAP      In addition to the default, retain specified\n"
126                "                           capability\n"
127                "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
128                "  -j                       Equivalent to --link-journal=host\n"
129                "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
130                "                           the container\n"
131                "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
132                program_invocation_short_name);
133
134         return 0;
135 }
136
137 static int parse_argv(int argc, char *argv[]) {
138
139         enum {
140                 ARG_VERSION = 0x100,
141                 ARG_PRIVATE_NETWORK,
142                 ARG_UUID,
143                 ARG_READ_ONLY,
144                 ARG_CAPABILITY,
145                 ARG_LINK_JOURNAL,
146                 ARG_BIND,
147                 ARG_BIND_RO
148         };
149
150         static const struct option options[] = {
151                 { "help",            no_argument,       NULL, 'h'                 },
152                 { "version",         no_argument,       NULL, ARG_VERSION         },
153                 { "directory",       required_argument, NULL, 'D'                 },
154                 { "user",            required_argument, NULL, 'u'                 },
155                 { "controllers",     required_argument, NULL, 'C'                 },
156                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
157                 { "boot",            no_argument,       NULL, 'b'                 },
158                 { "uuid",            required_argument, NULL, ARG_UUID            },
159                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
160                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
161                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
162                 { "bind",            required_argument, NULL, ARG_BIND            },
163                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
164                 { NULL,              0,                 NULL, 0                   }
165         };
166
167         int c;
168
169         assert(argc >= 0);
170         assert(argv);
171
172         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
173
174                 switch (c) {
175
176                 case 'h':
177                         help();
178                         return 0;
179
180                 case ARG_VERSION:
181                         puts(PACKAGE_STRING);
182                         puts(SYSTEMD_FEATURES);
183                         return 0;
184
185                 case 'D':
186                         free(arg_directory);
187                         arg_directory = canonicalize_file_name(optarg);
188                         if (!arg_directory) {
189                                 log_error("Failed to canonicalize root directory.");
190                                 return -ENOMEM;
191                         }
192
193                         break;
194
195                 case 'u':
196                         free(arg_user);
197                         if (!(arg_user = strdup(optarg))) {
198                                 log_error("Failed to duplicate user name.");
199                                 return -ENOMEM;
200                         }
201
202                         break;
203
204                 case 'C':
205                         strv_free(arg_controllers);
206                         arg_controllers = strv_split(optarg, ",");
207                         if (!arg_controllers) {
208                                 log_error("Failed to split controllers list.");
209                                 return -ENOMEM;
210                         }
211                         strv_uniq(arg_controllers);
212
213                         break;
214
215                 case ARG_PRIVATE_NETWORK:
216                         arg_private_network = true;
217                         break;
218
219                 case 'b':
220                         arg_boot = true;
221                         break;
222
223                 case ARG_UUID:
224                         arg_uuid = optarg;
225                         break;
226
227                 case ARG_READ_ONLY:
228                         arg_read_only = true;
229                         break;
230
231                 case ARG_CAPABILITY: {
232                         char *state, *word;
233                         size_t length;
234
235                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
236                                 cap_value_t cap;
237                                 char *t;
238
239                                 t = strndup(word, length);
240                                 if (!t)
241                                         return log_oom();
242
243                                 if (cap_from_name(t, &cap) < 0) {
244                                         log_error("Failed to parse capability %s.", t);
245                                         free(t);
246                                         return -EINVAL;
247                                 }
248
249                                 free(t);
250                                 arg_retain |= 1ULL << (uint64_t) cap;
251                         }
252
253                         break;
254                 }
255
256                 case 'j':
257                         arg_link_journal = LINK_GUEST;
258                         break;
259
260                 case ARG_LINK_JOURNAL:
261                         if (streq(optarg, "auto"))
262                                 arg_link_journal = LINK_AUTO;
263                         else if (streq(optarg, "no"))
264                                 arg_link_journal = LINK_NO;
265                         else if (streq(optarg, "guest"))
266                                 arg_link_journal = LINK_GUEST;
267                         else if (streq(optarg, "host"))
268                                 arg_link_journal = LINK_HOST;
269                         else {
270                                 log_error("Failed to parse link journal mode %s", optarg);
271                                 return -EINVAL;
272                         }
273
274                         break;
275
276                 case ARG_BIND:
277                 case ARG_BIND_RO: {
278                         _cleanup_free_ char *a = NULL, *b = NULL;
279                         char *e;
280                         char ***x;
281                         int r;
282
283                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
284
285                         e = strchr(optarg, ':');
286                         if (e) {
287                                 a = strndup(optarg, e - optarg);
288                                 b = strdup(e + 1);
289                         } else {
290                                 a = strdup(optarg);
291                                 b = strdup(optarg);
292                         }
293
294                         if (!a || !b)
295                                 return log_oom();
296
297                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
298                                 log_error("Invalid bind mount specification: %s", optarg);
299                                 return -EINVAL;
300                         }
301
302                         r = strv_extend(x, a);
303                         if (r < 0)
304                                 return r;
305
306                         r = strv_extend(x, b);
307                         if (r < 0)
308                                 return r;
309
310                         break;
311                 }
312
313                 case '?':
314                         return -EINVAL;
315
316                 default:
317                         log_error("Unknown option code %c", c);
318                         return -EINVAL;
319                 }
320         }
321
322         return 1;
323 }
324
325 static int mount_all(const char *dest) {
326
327         typedef struct MountPoint {
328                 const char *what;
329                 const char *where;
330                 const char *type;
331                 const char *options;
332                 unsigned long flags;
333                 bool fatal;
334         } MountPoint;
335
336         static const MountPoint mount_table[] = {
337                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
338                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
339                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
340                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
341                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
342                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
343                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
344                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
345 #ifdef HAVE_SELINUX
346                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
347                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
348 #endif
349         };
350
351         unsigned k;
352         int r = 0;
353
354         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
355                 char _cleanup_free_ *where = NULL;
356                 int t;
357
358                 where = strjoin(dest, "/", mount_table[k].where, NULL);
359                 if (!where)
360                         return log_oom();
361
362                 t = path_is_mount_point(where, true);
363                 if (t < 0) {
364                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
365
366                         if (r == 0)
367                                 r = t;
368
369                         continue;
370                 }
371
372                 /* Skip this entry if it is not a remount. */
373                 if (mount_table[k].what && t > 0)
374                         continue;
375
376                 mkdir_p(where, 0755);
377
378                 if (mount(mount_table[k].what,
379                           where,
380                           mount_table[k].type,
381                           mount_table[k].flags,
382                           mount_table[k].options) < 0 &&
383                     mount_table[k].fatal) {
384
385                         log_error("mount(%s) failed: %m", where);
386
387                         if (r == 0)
388                                 r = -errno;
389                 }
390         }
391
392         return r;
393 }
394
395 static int mount_binds(const char *dest, char **l, unsigned long flags) {
396         char **x, **y;
397
398         STRV_FOREACH_PAIR(x, y, l) {
399                 _cleanup_free_ char *where = NULL;
400
401                 where = strjoin(dest, "/", *y, NULL);
402                 if (!where)
403                         return log_oom();
404
405                 mkdir_p_label(where, 0755);
406
407                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
408                         log_error("mount(%s) failed: %m", where);
409                         return -errno;
410                 }
411
412                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
413                         log_error("mount(%s) failed: %m", where);
414                         return -errno;
415                 }
416         }
417
418         return 0;
419 }
420
421 static int setup_timezone(const char *dest) {
422         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
423         char *z, *y;
424         int r;
425
426         assert(dest);
427
428         /* Fix the timezone, if possible */
429         r = readlink_malloc("/etc/localtime", &p);
430         if (r < 0) {
431                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
432                 return 0;
433         }
434
435         z = path_startswith(p, "../usr/share/zoneinfo/");
436         if (!z)
437                 z = path_startswith(p, "/usr/share/zoneinfo/");
438         if (!z) {
439                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
440                 return 0;
441         }
442
443         where = strappend(dest, "/etc/localtime");
444         if (!where)
445                 return log_oom();
446
447         r = readlink_malloc(where, &q);
448         if (r >= 0) {
449                 y = path_startswith(q, "../usr/share/zoneinfo/");
450                 if (!y)
451                         y = path_startswith(q, "/usr/share/zoneinfo/");
452
453
454                 /* Already pointing to the right place? Then do nothing .. */
455                 if (y && streq(y, z))
456                         return 0;
457         }
458
459         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
460         if (!check)
461                 return log_oom();
462
463         if (access(check, F_OK) < 0) {
464                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
465                 return 0;
466         }
467
468         what = strappend("../usr/share/zoneinfo/", z);
469         if (!what)
470                 return log_oom();
471
472         unlink(where);
473         if (symlink(what, where) < 0) {
474                 log_error("Failed to correct timezone of container: %m");
475                 return 0;
476         }
477
478         return 0;
479 }
480
481 static int setup_resolv_conf(const char *dest) {
482         char *where;
483
484         assert(dest);
485
486         if (arg_private_network)
487                 return 0;
488
489         /* Fix resolv.conf, if possible */
490         where = strappend(dest, "/etc/resolv.conf");
491         if (!where)
492                 return log_oom();
493
494         /* We don't really care for the results of this really. If it
495          * fails, it fails, but meh... */
496         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
497                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
498
499         free(where);
500
501         return 0;
502 }
503
504 static int setup_boot_id(const char *dest) {
505         char _cleanup_free_ *from = NULL, *to = NULL;
506         sd_id128_t rnd;
507         char as_uuid[37];
508         int r;
509
510         assert(dest);
511
512         /* Generate a new randomized boot ID, so that each boot-up of
513          * the container gets a new one */
514
515         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
516         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
517         if (!from || !to)
518                 return log_oom();
519
520         r = sd_id128_randomize(&rnd);
521         if (r < 0) {
522                 log_error("Failed to generate random boot id: %s", strerror(-r));
523                 return r;
524         }
525
526         snprintf(as_uuid, sizeof(as_uuid),
527                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
528                  SD_ID128_FORMAT_VAL(rnd));
529         char_array_0(as_uuid);
530
531         r = write_string_file(from, as_uuid);
532         if (r < 0) {
533                 log_error("Failed to write boot id: %s", strerror(-r));
534                 return r;
535         }
536
537         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
538                 log_error("Failed to bind mount boot id: %m");
539                 r = -errno;
540         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
541                 log_warning("Failed to make boot id read-only: %m");
542
543         unlink(from);
544         return r;
545 }
546
547 static int copy_devnodes(const char *dest) {
548
549         static const char devnodes[] =
550                 "null\0"
551                 "zero\0"
552                 "full\0"
553                 "random\0"
554                 "urandom\0"
555                 "tty\0";
556
557         const char *d;
558         int r = 0;
559         mode_t _cleanup_umask_ u;
560
561         assert(dest);
562
563         u = umask(0000);
564
565         NULSTR_FOREACH(d, devnodes) {
566                 struct stat st;
567                 char _cleanup_free_ *from = NULL, *to = NULL;
568
569                 asprintf(&from, "/dev/%s", d);
570                 asprintf(&to, "%s/dev/%s", dest, d);
571
572                 if (!from || !to) {
573                         log_oom();
574
575                         if (r == 0)
576                                 r = -ENOMEM;
577
578                         break;
579                 }
580
581                 if (stat(from, &st) < 0) {
582
583                         if (errno != ENOENT) {
584                                 log_error("Failed to stat %s: %m", from);
585                                 if (r == 0)
586                                         r = -errno;
587                         }
588
589                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
590
591                         log_error("%s is not a char or block device, cannot copy", from);
592                         if (r == 0)
593                                 r = -EIO;
594
595                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
596
597                         log_error("mknod(%s) failed: %m", dest);
598                         if (r == 0)
599                                 r = -errno;
600                 }
601         }
602
603         return r;
604 }
605
606 static int setup_ptmx(const char *dest) {
607         _cleanup_free_ char *p = NULL;
608
609         p = strappend(dest, "/dev/ptmx");
610         if (!p)
611                 return log_oom();
612
613         if (symlink("pts/ptmx", p) < 0) {
614                 log_error("Failed to create /dev/ptmx symlink: %m");
615                 return -errno;
616         }
617
618         return 0;
619 }
620
621 static int setup_dev_console(const char *dest, const char *console) {
622         struct stat st;
623         char _cleanup_free_ *to = NULL;
624         int r;
625         mode_t _cleanup_umask_ u;
626
627         assert(dest);
628         assert(console);
629
630         u = umask(0000);
631
632         if (stat(console, &st) < 0) {
633                 log_error("Failed to stat %s: %m", console);
634                 return -errno;
635
636         } else if (!S_ISCHR(st.st_mode)) {
637                 log_error("/dev/console is not a char device");
638                 return -EIO;
639         }
640
641         r = chmod_and_chown(console, 0600, 0, 0);
642         if (r < 0) {
643                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
644                 return r;
645         }
646
647         if (asprintf(&to, "%s/dev/console", dest) < 0)
648                 return log_oom();
649
650         /* We need to bind mount the right tty to /dev/console since
651          * ptys can only exist on pts file systems. To have something
652          * to bind mount things on we create a device node first, that
653          * has the right major/minor (note that the major minor
654          * doesn't actually matter here, since we mount it over
655          * anyway). */
656
657         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
658                 log_error("mknod() for /dev/console failed: %m");
659                 return -errno;
660         }
661
662         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
663                 log_error("Bind mount for /dev/console failed: %m");
664                 return -errno;
665         }
666
667         return 0;
668 }
669
670 static int setup_kmsg(const char *dest, int kmsg_socket) {
671         char _cleanup_free_ *from = NULL, *to = NULL;
672         int r, fd, k;
673         mode_t _cleanup_umask_ u;
674         union {
675                 struct cmsghdr cmsghdr;
676                 uint8_t buf[CMSG_SPACE(sizeof(int))];
677         } control;
678         struct msghdr mh;
679         struct cmsghdr *cmsg;
680
681         assert(dest);
682         assert(kmsg_socket >= 0);
683
684         u = umask(0000);
685
686         /* We create the kmsg FIFO as /dev/kmsg, but immediately
687          * delete it after bind mounting it to /proc/kmsg. While FIFOs
688          * on the reading side behave very similar to /proc/kmsg,
689          * their writing side behaves differently from /dev/kmsg in
690          * that writing blocks when nothing is reading. In order to
691          * avoid any problems with containers deadlocking due to this
692          * we simply make /dev/kmsg unavailable to the container. */
693         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
694             asprintf(&to, "%s/proc/kmsg", dest) < 0)
695                 return log_oom();
696
697         if (mkfifo(from, 0600) < 0) {
698                 log_error("mkfifo() for /dev/kmsg failed: %m");
699                 return -errno;
700         }
701
702         r = chmod_and_chown(from, 0600, 0, 0);
703         if (r < 0) {
704                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
705                 return r;
706         }
707
708         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
709                 log_error("Bind mount for /proc/kmsg failed: %m");
710                 return -errno;
711         }
712
713         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
714         if (fd < 0) {
715                 log_error("Failed to open fifo: %m");
716                 return -errno;
717         }
718
719         zero(mh);
720         zero(control);
721
722         mh.msg_control = &control;
723         mh.msg_controllen = sizeof(control);
724
725         cmsg = CMSG_FIRSTHDR(&mh);
726         cmsg->cmsg_level = SOL_SOCKET;
727         cmsg->cmsg_type = SCM_RIGHTS;
728         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
729         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
730
731         mh.msg_controllen = cmsg->cmsg_len;
732
733         /* Store away the fd in the socket, so that it stays open as
734          * long as we run the child */
735         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
736         close_nointr_nofail(fd);
737
738         if (k < 0) {
739                 log_error("Failed to send FIFO fd: %m");
740                 return -errno;
741         }
742
743         /* And now make the FIFO unavailable as /dev/kmsg... */
744         unlink(from);
745         return 0;
746 }
747
748 static int setup_hostname(void) {
749         char *hn;
750         int r = 0;
751
752         hn = path_get_file_name(arg_directory);
753         if (hn) {
754                 hn = strdup(hn);
755                 if (!hn)
756                         return -ENOMEM;
757
758                 hostname_cleanup(hn);
759
760                 if (!isempty(hn))
761                         if (sethostname(hn, strlen(hn)) < 0)
762                                 r = -errno;
763
764                 free(hn);
765         }
766
767         return r;
768 }
769
770 static int setup_journal(const char *directory) {
771         sd_id128_t machine_id;
772         char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
773         char *id;
774         int r;
775
776         if (arg_link_journal == LINK_NO)
777                 return 0;
778
779         p = strappend(directory, "/etc/machine-id");
780         if (!p)
781                 return log_oom();
782
783         r = read_one_line_file(p, &b);
784         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
785                 return 0;
786         else if (r < 0) {
787                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
788                 return r;
789         }
790
791         id = strstrip(b);
792         if (isempty(id) && arg_link_journal == LINK_AUTO)
793                 return 0;
794
795         /* Verify validity */
796         r = sd_id128_from_string(id, &machine_id);
797         if (r < 0) {
798                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
799                 return r;
800         }
801
802         free(p);
803         p = strappend("/var/log/journal/", id);
804         q = strjoin(directory, "/var/log/journal/", id, NULL);
805         if (!p || !q)
806                 return log_oom();
807
808         if (path_is_mount_point(p, false) > 0) {
809                 if (arg_link_journal != LINK_AUTO) {
810                         log_error("%s: already a mount point, refusing to use for journal", p);
811                         return -EEXIST;
812                 }
813
814                 return 0;
815         }
816
817         if (path_is_mount_point(q, false) > 0) {
818                 if (arg_link_journal != LINK_AUTO) {
819                         log_error("%s: already a mount point, refusing to use for journal", q);
820                         return -EEXIST;
821                 }
822
823                 return 0;
824         }
825
826         r = readlink_and_make_absolute(p, &d);
827         if (r >= 0) {
828                 if ((arg_link_journal == LINK_GUEST ||
829                      arg_link_journal == LINK_AUTO) &&
830                     path_equal(d, q)) {
831
832                         r = mkdir_p(q, 0755);
833                         if (r < 0)
834                                 log_warning("failed to create directory %s: %m", q);
835                         return 0;
836                 }
837
838                 if (unlink(p) < 0) {
839                         log_error("Failed to remove symlink %s: %m", p);
840                         return -errno;
841                 }
842         } else if (r == -EINVAL) {
843
844                 if (arg_link_journal == LINK_GUEST &&
845                     rmdir(p) < 0) {
846
847                         if (errno == ENOTDIR) {
848                                 log_error("%s already exists and is neither a symlink nor a directory", p);
849                                 return r;
850                         } else {
851                                 log_error("Failed to remove %s: %m", p);
852                                 return -errno;
853                         }
854                 }
855         } else if (r != -ENOENT) {
856                 log_error("readlink(%s) failed: %m", p);
857                 return r;
858         }
859
860         if (arg_link_journal == LINK_GUEST) {
861
862                 if (symlink(q, p) < 0) {
863                         log_error("Failed to symlink %s to %s: %m", q, p);
864                         return -errno;
865                 }
866
867                 r = mkdir_p(q, 0755);
868                 if (r < 0)
869                         log_warning("failed to create directory %s: %m", q);
870                 return 0;
871         }
872
873         if (arg_link_journal == LINK_HOST) {
874                 r = mkdir_p(p, 0755);
875                 if (r < 0) {
876                         log_error("Failed to create %s: %m", p);
877                         return r;
878                 }
879
880         } else if (access(p, F_OK) < 0)
881                 return 0;
882
883         if (dir_is_empty(q) == 0) {
884                 log_error("%s not empty.", q);
885                 return -ENOTEMPTY;
886         }
887
888         r = mkdir_p(q, 0755);
889         if (r < 0) {
890                 log_error("Failed to create %s: %m", q);
891                 return r;
892         }
893
894         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
895                 log_error("Failed to bind mount journal from host into guest: %m");
896                 return -errno;
897         }
898
899         return 0;
900 }
901
902 static int drop_capabilities(void) {
903         return capability_bounding_set_drop(~arg_retain, false);
904 }
905
/* Tells whether the directory tree at 'path' looks like an OS root.
 *
 * The existence of bin/sh below 'path' is used as the marker file.
 *
 * Returns 1 if the marker is accessible, 0 if it is not, and -ENOMEM
 * if the marker path could not be allocated. */
static int is_os_tree(const char *path) {
        char *marker = NULL;
        int found;

        if (asprintf(&marker, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        /* Any access() failure counts as "marker missing" */
        found = access(marker, F_OK) >= 0;
        free(marker);

        return found;
}
919
920 static int process_pty(int master, pid_t pid, sigset_t *mask) {
921
922         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
923         size_t in_buffer_full = 0, out_buffer_full = 0;
924         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
925         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
926         int ep = -1, signal_fd = -1, r;
927         bool tried_orderly_shutdown = false;
928
929         assert(master >= 0);
930         assert(pid > 0);
931         assert(mask);
932
933         fd_nonblock(STDIN_FILENO, 1);
934         fd_nonblock(STDOUT_FILENO, 1);
935         fd_nonblock(master, 1);
936
937         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
938         if (signal_fd < 0) {
939                 log_error("signalfd(): %m");
940                 r = -errno;
941                 goto finish;
942         }
943
944         ep = epoll_create1(EPOLL_CLOEXEC);
945         if (ep < 0) {
946                 log_error("Failed to create epoll: %m");
947                 r = -errno;
948                 goto finish;
949         }
950
951         /* We read from STDIN only if this is actually a TTY,
952          * otherwise we assume non-interactivity. */
953         if (isatty(STDIN_FILENO)) {
954                 zero(stdin_ev);
955                 stdin_ev.events = EPOLLIN|EPOLLET;
956                 stdin_ev.data.fd = STDIN_FILENO;
957
958                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
959                         log_error("Failed to register STDIN in epoll: %m");
960                         r = -errno;
961                         goto finish;
962                 }
963         }
964
965         zero(stdout_ev);
966         stdout_ev.events = EPOLLOUT|EPOLLET;
967         stdout_ev.data.fd = STDOUT_FILENO;
968
969         zero(master_ev);
970         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
971         master_ev.data.fd = master;
972
973         zero(signal_ev);
974         signal_ev.events = EPOLLIN;
975         signal_ev.data.fd = signal_fd;
976
977         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
978                 if (errno != EPERM) {
979                         log_error("Failed to register stdout in epoll: %m");
980                         r = -errno;
981                         goto finish;
982                 }
983                 /* stdout without epoll support. Likely redirected to regular file. */
984                 stdout_writable = true;
985         }
986
987         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
988             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
989                 log_error("Failed to register fds in epoll: %m");
990                 r = -errno;
991                 goto finish;
992         }
993
994         for (;;) {
995                 struct epoll_event ev[16];
996                 ssize_t k;
997                 int i, nfds;
998
999                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1000                 if (nfds < 0) {
1001
1002                         if (errno == EINTR || errno == EAGAIN)
1003                                 continue;
1004
1005                         log_error("epoll_wait(): %m");
1006                         r = -errno;
1007                         goto finish;
1008                 }
1009
1010                 assert(nfds >= 1);
1011
1012                 for (i = 0; i < nfds; i++) {
1013                         if (ev[i].data.fd == STDIN_FILENO) {
1014
1015                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1016                                         stdin_readable = true;
1017
1018                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1019
1020                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1021                                         stdout_writable = true;
1022
1023                         } else if (ev[i].data.fd == master) {
1024
1025                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1026                                         master_readable = true;
1027
1028                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1029                                         master_writable = true;
1030
1031                         } else if (ev[i].data.fd == signal_fd) {
1032                                 struct signalfd_siginfo sfsi;
1033                                 ssize_t n;
1034
1035                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1036                                 if (n != sizeof(sfsi)) {
1037
1038                                         if (n >= 0) {
1039                                                 log_error("Failed to read from signalfd: invalid block size");
1040                                                 r = -EIO;
1041                                                 goto finish;
1042                                         }
1043
1044                                         if (errno != EINTR && errno != EAGAIN) {
1045                                                 log_error("Failed to read from signalfd: %m");
1046                                                 r = -errno;
1047                                                 goto finish;
1048                                         }
1049                                 } else {
1050
1051                                         if (sfsi.ssi_signo == SIGWINCH) {
1052                                                 struct winsize ws;
1053
1054                                                 /* The window size changed, let's forward that. */
1055                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1056                                                         ioctl(master, TIOCSWINSZ, &ws);
1057                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1058
1059                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1060
1061                                                 /* This only works for systemd... */
1062                                                 tried_orderly_shutdown = true;
1063                                                 kill(pid, SIGRTMIN+3);
1064
1065                                         } else {
1066                                                 r = 0;
1067                                                 goto finish;
1068                                         }
1069                                 }
1070                         }
1071                 }
1072
1073                 while ((stdin_readable && in_buffer_full <= 0) ||
1074                        (master_writable && in_buffer_full > 0) ||
1075                        (master_readable && out_buffer_full <= 0) ||
1076                        (stdout_writable && out_buffer_full > 0)) {
1077
1078                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1079
1080                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1081                                 if (k < 0) {
1082
1083                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1084                                                 stdin_readable = false;
1085                                         else {
1086                                                 log_error("read(): %m");
1087                                                 r = -errno;
1088                                                 goto finish;
1089                                         }
1090                                 } else
1091                                         in_buffer_full += (size_t) k;
1092                         }
1093
1094                         if (master_writable && in_buffer_full > 0) {
1095
1096                                 k = write(master, in_buffer, in_buffer_full);
1097                                 if (k < 0) {
1098
1099                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1100                                                 master_writable = false;
1101                                         else {
1102                                                 log_error("write(): %m");
1103                                                 r = -errno;
1104                                                 goto finish;
1105                                         }
1106
1107                                 } else {
1108                                         assert(in_buffer_full >= (size_t) k);
1109                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1110                                         in_buffer_full -= k;
1111                                 }
1112                         }
1113
1114                         if (master_readable && out_buffer_full < LINE_MAX) {
1115
1116                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1117                                 if (k < 0) {
1118
1119                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1120                                                 master_readable = false;
1121                                         else {
1122                                                 log_error("read(): %m");
1123                                                 r = -errno;
1124                                                 goto finish;
1125                                         }
1126                                 }  else
1127                                         out_buffer_full += (size_t) k;
1128                         }
1129
1130                         if (stdout_writable && out_buffer_full > 0) {
1131
1132                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1133                                 if (k < 0) {
1134
1135                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1136                                                 stdout_writable = false;
1137                                         else {
1138                                                 log_error("write(): %m");
1139                                                 r = -errno;
1140                                                 goto finish;
1141                                         }
1142
1143                                 } else {
1144                                         assert(out_buffer_full >= (size_t) k);
1145                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1146                                         out_buffer_full -= k;
1147                                 }
1148                         }
1149                 }
1150         }
1151
1152 finish:
1153         if (ep >= 0)
1154                 close_nointr_nofail(ep);
1155
1156         if (signal_fd >= 0)
1157                 close_nointr_nofail(signal_fd);
1158
1159         return r;
1160 }
1161
1162 int main(int argc, char *argv[]) {
1163         pid_t pid = 0;
1164         int r = EXIT_FAILURE, k;
1165         char *oldcg = NULL, *newcg = NULL;
1166         char **controller = NULL;
1167         int master = -1, n_fd_passed;
1168         const char *console = NULL;
1169         struct termios saved_attr, raw_attr;
1170         sigset_t mask;
1171         bool saved_attr_valid = false;
1172         struct winsize ws;
1173         int kmsg_socket_pair[2] = { -1, -1 };
1174         FDSet *fds = NULL;
1175
1176         log_parse_environment();
1177         log_open();
1178
1179         r = parse_argv(argc, argv);
1180         if (r <= 0)
1181                 goto finish;
1182
1183         if (arg_directory) {
1184                 char *p;
1185
1186                 p = path_make_absolute_cwd(arg_directory);
1187                 free(arg_directory);
1188                 arg_directory = p;
1189         } else
1190                 arg_directory = get_current_dir_name();
1191
1192         if (!arg_directory) {
1193                 log_error("Failed to determine path");
1194                 goto finish;
1195         }
1196
1197         path_kill_slashes(arg_directory);
1198
1199         if (geteuid() != 0) {
1200                 log_error("Need to be root.");
1201                 goto finish;
1202         }
1203
1204         if (sd_booted() <= 0) {
1205                 log_error("Not running on a systemd system.");
1206                 goto finish;
1207         }
1208
1209         if (path_equal(arg_directory, "/")) {
1210                 log_error("Spawning container on root directory not supported.");
1211                 goto finish;
1212         }
1213
1214         if (is_os_tree(arg_directory) <= 0) {
1215                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1216                 goto finish;
1217         }
1218
1219         log_close();
1220         n_fd_passed = sd_listen_fds(false);
1221         if (n_fd_passed > 0) {
1222                 k = fdset_new_listen_fds(&fds, false);
1223                 if (k < 0) {
1224                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1225                         goto finish;
1226                 }
1227         }
1228         fdset_close_others(fds);
1229         log_open();
1230
1231         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1232         if (k < 0) {
1233                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1234                 goto finish;
1235         }
1236
1237         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1238                 log_error("Failed to allocate cgroup path.");
1239                 goto finish;
1240         }
1241
1242         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1243         if (k < 0)  {
1244                 log_error("Failed to create cgroup: %s", strerror(-k));
1245                 goto finish;
1246         }
1247
1248         STRV_FOREACH(controller, arg_controllers) {
1249                 k = cg_create_and_attach(*controller, newcg, 0);
1250                 if (k < 0)
1251                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1252         }
1253
1254         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1255         if (master < 0) {
1256                 log_error("Failed to acquire pseudo tty: %m");
1257                 goto finish;
1258         }
1259
1260         console = ptsname(master);
1261         if (!console) {
1262                 log_error("Failed to determine tty name: %m");
1263                 goto finish;
1264         }
1265
1266         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1267
1268         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1269                 ioctl(master, TIOCSWINSZ, &ws);
1270
1271         if (unlockpt(master) < 0) {
1272                 log_error("Failed to unlock tty: %m");
1273                 goto finish;
1274         }
1275
1276         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1277                 saved_attr_valid = true;
1278
1279                 raw_attr = saved_attr;
1280                 cfmakeraw(&raw_attr);
1281                 raw_attr.c_lflag &= ~ECHO;
1282         }
1283
1284         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1285                 log_error("Failed to create kmsg socket pair");
1286                 goto finish;
1287         }
1288
1289         assert_se(sigemptyset(&mask) == 0);
1290         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1291         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1292
1293         for (;;) {
1294                 siginfo_t status;
1295                 int pipefd[2];
1296
1297                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1298                         log_error("pipe2(): %m");
1299                         goto finish;
1300                 }
1301
1302                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1303                 if (pid < 0) {
1304                         if (errno == EINVAL)
1305                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1306                         else
1307                                 log_error("clone() failed: %m");
1308
1309                         goto finish;
1310                 }
1311
1312                 if (pid == 0) {
1313                         /* child */
1314                         const char *home = NULL;
1315                         uid_t uid = (uid_t) -1;
1316                         gid_t gid = (gid_t) -1;
1317                         unsigned n_env = 2;
1318                         const char *envp[] = {
1319                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1320                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1321                                 NULL, /* TERM */
1322                                 NULL, /* HOME */
1323                                 NULL, /* USER */
1324                                 NULL, /* LOGNAME */
1325                                 NULL, /* container_uuid */
1326                                 NULL, /* LISTEN_FDS */
1327                                 NULL, /* LISTEN_PID */
1328                                 NULL
1329                         };
1330
1331                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1332                         if (envp[n_env])
1333                                 n_env ++;
1334
1335                         close_nointr_nofail(pipefd[1]);
1336                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1337                         close_nointr_nofail(pipefd[0]);
1338
1339                         close_nointr_nofail(master);
1340                         master = -1;
1341
1342                         if (saved_attr_valid) {
1343                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1344                                         log_error("Failed to set terminal attributes: %m");
1345                                         goto child_fail;
1346                                 }
1347                         }
1348
1349                         close_nointr(STDIN_FILENO);
1350                         close_nointr(STDOUT_FILENO);
1351                         close_nointr(STDERR_FILENO);
1352
1353                         close_nointr_nofail(kmsg_socket_pair[0]);
1354                         kmsg_socket_pair[0] = -1;
1355
1356                         reset_all_signal_handlers();
1357
1358                         assert_se(sigemptyset(&mask) == 0);
1359                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1360
1361                         k = open_terminal(console, O_RDWR);
1362                         if (k != STDIN_FILENO) {
1363                                 if (k >= 0) {
1364                                         close_nointr_nofail(k);
1365                                         k = -EINVAL;
1366                                 }
1367
1368                                 log_error("Failed to open console: %s", strerror(-k));
1369                                 goto child_fail;
1370                         }
1371
1372                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1373                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1374                                 log_error("Failed to duplicate console: %m");
1375                                 goto child_fail;
1376                         }
1377
1378                         if (setsid() < 0) {
1379                                 log_error("setsid() failed: %m");
1380                                 goto child_fail;
1381                         }
1382
1383                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1384                                 log_error("PR_SET_PDEATHSIG failed: %m");
1385                                 goto child_fail;
1386                         }
1387
1388                         /* Mark everything as slave, so that we still
1389                          * receive mounts from the real root, but don't
1390                          * propagate mounts to the real root. */
1391                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1392                                 log_error("MS_SLAVE|MS_REC failed: %m");
1393                                 goto child_fail;
1394                         }
1395
1396                         /* Turn directory into bind mount */
1397                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1398                                 log_error("Failed to make bind mount.");
1399                                 goto child_fail;
1400                         }
1401
1402                         if (arg_read_only)
1403                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1404                                         log_error("Failed to make read-only.");
1405                                         goto child_fail;
1406                                 }
1407
1408                         if (mount_all(arg_directory) < 0)
1409                                 goto child_fail;
1410
1411                         if (copy_devnodes(arg_directory) < 0)
1412                                 goto child_fail;
1413
1414                         if (setup_ptmx(arg_directory) < 0)
1415                                 goto child_fail;
1416
1417                         dev_setup(arg_directory);
1418
1419                         if (setup_dev_console(arg_directory, console) < 0)
1420                                 goto child_fail;
1421
1422                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1423                                 goto child_fail;
1424
1425                         close_nointr_nofail(kmsg_socket_pair[1]);
1426                         kmsg_socket_pair[1] = -1;
1427
1428                         if (setup_boot_id(arg_directory) < 0)
1429                                 goto child_fail;
1430
1431                         if (setup_timezone(arg_directory) < 0)
1432                                 goto child_fail;
1433
1434                         if (setup_resolv_conf(arg_directory) < 0)
1435                                 goto child_fail;
1436
1437                         if (setup_journal(arg_directory) < 0)
1438                                 goto child_fail;
1439
1440                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1441                                 goto child_fail;
1442
1443                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1444                                 goto child_fail;
1445
1446                         if (chdir(arg_directory) < 0) {
1447                                 log_error("chdir(%s) failed: %m", arg_directory);
1448                                 goto child_fail;
1449                         }
1450
1451                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1452                                 log_error("mount(MS_MOVE) failed: %m");
1453                                 goto child_fail;
1454                         }
1455
1456                         if (chroot(".") < 0) {
1457                                 log_error("chroot() failed: %m");
1458                                 goto child_fail;
1459                         }
1460
1461                         if (chdir("/") < 0) {
1462                                 log_error("chdir() failed: %m");
1463                                 goto child_fail;
1464                         }
1465
1466                         umask(0022);
1467
1468                         loopback_setup();
1469
1470                         if (drop_capabilities() < 0) {
1471                                 log_error("drop_capabilities() failed: %m");
1472                                 goto child_fail;
1473                         }
1474
1475                         if (arg_user) {
1476
1477                                 /* Note that this resolves user names
1478                                  * inside the container, and hence
1479                                  * accesses the NSS modules from the
1480                                  * container and not the host. This is
1481                                  * a bit weird... */
1482
1483                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1484                                         log_error("get_user_creds() failed: %m");
1485                                         goto child_fail;
1486                                 }
1487
1488                                 if (mkdir_parents_label(home, 0775) < 0) {
1489                                         log_error("mkdir_parents_label() failed: %m");
1490                                         goto child_fail;
1491                                 }
1492
1493                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1494                                         log_error("mkdir_safe_label() failed: %m");
1495                                         goto child_fail;
1496                                 }
1497
1498                                 if (initgroups((const char*)arg_user, gid) < 0) {
1499                                         log_error("initgroups() failed: %m");
1500                                         goto child_fail;
1501                                 }
1502
1503                                 if (setresgid(gid, gid, gid) < 0) {
1504                                         log_error("setregid() failed: %m");
1505                                         goto child_fail;
1506                                 }
1507
1508                                 if (setresuid(uid, uid, uid) < 0) {
1509                                         log_error("setreuid() failed: %m");
1510                                         goto child_fail;
1511                                 }
1512                         } else {
1513                                 /* Reset everything fully to 0, just in case */
1514
1515                                 if (setgroups(0, NULL) < 0) {
1516                                         log_error("setgroups() failed: %m");
1517                                         goto child_fail;
1518                                 }
1519
1520                                 if (setresgid(0, 0, 0) < 0) {
1521                                         log_error("setregid() failed: %m");
1522                                         goto child_fail;
1523                                 }
1524
1525                                 if (setresuid(0, 0, 0) < 0) {
1526                                         log_error("setreuid() failed: %m");
1527                                         goto child_fail;
1528                                 }
1529                         }
1530
1531                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1532                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1533                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1534                                 log_oom();
1535                                 goto child_fail;
1536                         }
1537
1538                         if (arg_uuid) {
1539                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1540                                         log_oom();
1541                                         goto child_fail;
1542                                 }
1543                         }
1544
1545                         if (fdset_size(fds) > 0) {
1546                                 k = fdset_cloexec(fds, false);
1547                                 if (k < 0) {
1548                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1549                                         goto child_fail;
1550                                 }
1551
1552                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1553                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1554                                         log_oom();
1555                                         goto child_fail;
1556                                 }
1557                         }
1558
1559                         setup_hostname();
1560
1561                         if (arg_boot) {
1562                                 char **a;
1563                                 size_t l;
1564
1565                                 /* Automatically search for the init system */
1566
1567                                 l = 1 + argc - optind;
1568                                 a = newa(char*, l + 1);
1569                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1570
1571                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1572                                 execve(a[0], a, (char**) envp);
1573
1574                                 a[0] = (char*) "/lib/systemd/systemd";
1575                                 execve(a[0], a, (char**) envp);
1576
1577                                 a[0] = (char*) "/sbin/init";
1578                                 execve(a[0], a, (char**) envp);
1579                         } else if (argc > optind)
1580                                 execvpe(argv[optind], argv + optind, (char**) envp);
1581                         else {
1582                                 chdir(home ? home : "/root");
1583                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1584                         }
1585
1586                         log_error("execv() failed: %m");
1587
1588                 child_fail:
1589                         _exit(EXIT_FAILURE);
1590                 }
1591
1592                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1593                 close_nointr_nofail(pipefd[0]);
1594                 close_nointr_nofail(pipefd[1]);
1595
1596                 fdset_free(fds);
1597                 fds = NULL;
1598
1599                 if (process_pty(master, pid, &mask) < 0)
1600                         goto finish;
1601
1602                 if (saved_attr_valid)
1603                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1604
1605                 r = wait_for_terminate(pid, &status);
1606                 if (r < 0) {
1607                         r = EXIT_FAILURE;
1608                         break;
1609                 }
1610
1611                 if (status.si_code == CLD_EXITED) {
1612                         if (status.si_status != 0) {
1613                                 log_error("Container failed with error code %i.", status.si_status);
1614                                 r = status.si_status;
1615                                 break;
1616                         }
1617
1618                         log_debug("Container exited successfully.");
1619                         break;
1620                 } else if (status.si_code == CLD_KILLED &&
1621                            status.si_status == SIGINT) {
1622                         log_info("Container has been shut down.");
1623                         r = 0;
1624                         break;
1625                 } else if (status.si_code == CLD_KILLED &&
1626                            status.si_status == SIGHUP) {
1627                         log_info("Container is being rebooted.");
1628                         continue;
1629                 } else if (status.si_code == CLD_KILLED ||
1630                            status.si_code == CLD_DUMPED) {
1631
1632                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1633                         r = EXIT_FAILURE;
1634                         break;
1635                 } else {
1636                         log_error("Container failed due to unknown reason.");
1637                         r = EXIT_FAILURE;
1638                         break;
1639                 }
1640         }
1641
1642 finish:
1643         if (saved_attr_valid)
1644                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1645
1646         if (master >= 0)
1647                 close_nointr_nofail(master);
1648
1649         close_pipe(kmsg_socket_pair);
1650
1651         if (oldcg)
1652                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1653
1654         if (newcg)
1655                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1656
1657         free(arg_directory);
1658         strv_free(arg_controllers);
1659         free(oldcg);
1660         free(newcg);
1661
1662         fdset_free(fds);
1663
1664         return r;
1665 }