chiark / gitweb /
b90ccc53ef51fae8d4ffaf29cb7ce6463761a789
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44
45 #include <systemd/sd-daemon.h>
46
47 #include "log.h"
48 #include "util.h"
49 #include "mkdir.h"
50 #include "macro.h"
51 #include "audit.h"
52 #include "missing.h"
53 #include "cgroup-util.h"
54 #include "strv.h"
55 #include "path-util.h"
56 #include "loopback-setup.h"
57 #include "sd-id128.h"
58 #include "dev-setup.h"
59 #include "fdset.h"
60 #include "build.h"
61 #include "fileio.h"
62
/* Group id handed to the container's devpts mount (gid= option); a build
 * may predefine TTY_GID to override the fallback value. */
#if !defined(TTY_GID)
#define TTY_GID 5
#endif
66
/* Modes accepted by --link-journal= (see parse_argv() and setup_journal()). */
typedef enum LinkJournal {
        LINK_NO,        /* "no": leave the journal directories alone */
        LINK_AUTO,      /* "auto": the default mode */
        LINK_HOST,      /* "host": bind mount the host directory into the guest */
        LINK_GUEST      /* "guest": symlink the host directory to the guest one */
} LinkJournal;
73
74 static char *arg_directory = NULL;
75 static char *arg_user = NULL;
76 static char **arg_controllers = NULL;
77 static char *arg_uuid = NULL;
78 static bool arg_private_network = false;
79 static bool arg_read_only = false;
80 static bool arg_boot = false;
81 static LinkJournal arg_link_journal = LINK_AUTO;
82 static uint64_t arg_retain =
83         (1ULL << CAP_CHOWN) |
84         (1ULL << CAP_DAC_OVERRIDE) |
85         (1ULL << CAP_DAC_READ_SEARCH) |
86         (1ULL << CAP_FOWNER) |
87         (1ULL << CAP_FSETID) |
88         (1ULL << CAP_IPC_OWNER) |
89         (1ULL << CAP_KILL) |
90         (1ULL << CAP_LEASE) |
91         (1ULL << CAP_LINUX_IMMUTABLE) |
92         (1ULL << CAP_NET_BIND_SERVICE) |
93         (1ULL << CAP_NET_BROADCAST) |
94         (1ULL << CAP_NET_RAW) |
95         (1ULL << CAP_SETGID) |
96         (1ULL << CAP_SETFCAP) |
97         (1ULL << CAP_SETPCAP) |
98         (1ULL << CAP_SETUID) |
99         (1ULL << CAP_SYS_ADMIN) |
100         (1ULL << CAP_SYS_CHROOT) |
101         (1ULL << CAP_SYS_NICE) |
102         (1ULL << CAP_SYS_PTRACE) |
103         (1ULL << CAP_SYS_TTY_CONFIG) |
104         (1ULL << CAP_SYS_RESOURCE) |
105         (1ULL << CAP_SYS_BOOT) |
106         (1ULL << CAP_AUDIT_WRITE) |
107         (1ULL << CAP_AUDIT_CONTROL);
108 static char **arg_bind = NULL;
109 static char **arg_bind_ro = NULL;
110
/* Print the usage summary to stdout. Always returns 0.
 *
 * Long-only options are indented so that their "--" lines up with the long
 * form of options that also have a short name. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                Show this help\n"
               "     --version             Print version string\n"
               "  -D --directory=NAME      Root directory for the container\n"
               "  -b --boot                Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER           Run the command under specified user or uid\n"
               "  -C --controllers=LIST    Put the container in specified comma-separated\n"
               "                           cgroup hierarchies\n"
               "     --uuid=UUID           Set a specific machine UUID for the container\n"
               "     --private-network     Disable network in container\n"
               "     --read-only           Mount the root directory read-only\n"
               "     --capability=CAP      In addition to the default, retain specified\n"
               "                           capability\n"
               "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
               /* parse_argv() maps -j to LINK_GUEST, so document it as such. */
               "  -j                       Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
               "                           the container\n"
               "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
               program_invocation_short_name);

        return 0;
}
136
137 static int parse_argv(int argc, char *argv[]) {
138
139         enum {
140                 ARG_VERSION = 0x100,
141                 ARG_PRIVATE_NETWORK,
142                 ARG_UUID,
143                 ARG_READ_ONLY,
144                 ARG_CAPABILITY,
145                 ARG_LINK_JOURNAL,
146                 ARG_BIND,
147                 ARG_BIND_RO
148         };
149
150         static const struct option options[] = {
151                 { "help",            no_argument,       NULL, 'h'                 },
152                 { "version",         no_argument,       NULL, ARG_VERSION         },
153                 { "directory",       required_argument, NULL, 'D'                 },
154                 { "user",            required_argument, NULL, 'u'                 },
155                 { "controllers",     required_argument, NULL, 'C'                 },
156                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
157                 { "boot",            no_argument,       NULL, 'b'                 },
158                 { "uuid",            required_argument, NULL, ARG_UUID            },
159                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
160                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
161                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
162                 { "bind",            required_argument, NULL, ARG_BIND            },
163                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
164                 { NULL,              0,                 NULL, 0                   }
165         };
166
167         int c;
168
169         assert(argc >= 0);
170         assert(argv);
171
172         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
173
174                 switch (c) {
175
176                 case 'h':
177                         help();
178                         return 0;
179
180                 case ARG_VERSION:
181                         puts(PACKAGE_STRING);
182                         puts(SYSTEMD_FEATURES);
183                         return 0;
184
185                 case 'D':
186                         free(arg_directory);
187                         arg_directory = canonicalize_file_name(optarg);
188                         if (!arg_directory) {
189                                 log_error("Failed to canonicalize root directory.");
190                                 return -ENOMEM;
191                         }
192
193                         break;
194
195                 case 'u':
196                         free(arg_user);
197                         if (!(arg_user = strdup(optarg))) {
198                                 log_error("Failed to duplicate user name.");
199                                 return -ENOMEM;
200                         }
201
202                         break;
203
204                 case 'C':
205                         strv_free(arg_controllers);
206                         arg_controllers = strv_split(optarg, ",");
207                         if (!arg_controllers) {
208                                 log_error("Failed to split controllers list.");
209                                 return -ENOMEM;
210                         }
211                         strv_uniq(arg_controllers);
212
213                         break;
214
215                 case ARG_PRIVATE_NETWORK:
216                         arg_private_network = true;
217                         break;
218
219                 case 'b':
220                         arg_boot = true;
221                         break;
222
223                 case ARG_UUID:
224                         arg_uuid = optarg;
225                         break;
226
227                 case ARG_READ_ONLY:
228                         arg_read_only = true;
229                         break;
230
231                 case ARG_CAPABILITY: {
232                         char *state, *word;
233                         size_t length;
234
235                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
236                                 cap_value_t cap;
237                                 char *t;
238
239                                 t = strndup(word, length);
240                                 if (!t)
241                                         return log_oom();
242
243                                 if (cap_from_name(t, &cap) < 0) {
244                                         log_error("Failed to parse capability %s.", t);
245                                         free(t);
246                                         return -EINVAL;
247                                 }
248
249                                 free(t);
250                                 arg_retain |= 1ULL << (uint64_t) cap;
251                         }
252
253                         break;
254                 }
255
256                 case 'j':
257                         arg_link_journal = LINK_GUEST;
258                         break;
259
260                 case ARG_LINK_JOURNAL:
261                         if (streq(optarg, "auto"))
262                                 arg_link_journal = LINK_AUTO;
263                         else if (streq(optarg, "no"))
264                                 arg_link_journal = LINK_NO;
265                         else if (streq(optarg, "guest"))
266                                 arg_link_journal = LINK_GUEST;
267                         else if (streq(optarg, "host"))
268                                 arg_link_journal = LINK_HOST;
269                         else {
270                                 log_error("Failed to parse link journal mode %s", optarg);
271                                 return -EINVAL;
272                         }
273
274                         break;
275
276                 case ARG_BIND:
277                 case ARG_BIND_RO: {
278                         _cleanup_free_ char *a = NULL, *b = NULL;
279                         char *e;
280                         char ***x;
281                         int r;
282
283                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
284
285                         e = strchr(optarg, ':');
286                         if (e) {
287                                 a = strndup(optarg, e - optarg);
288                                 b = strdup(e + 1);
289                         } else {
290                                 a = strdup(optarg);
291                                 b = strdup(optarg);
292                         }
293
294                         if (!a || !b)
295                                 return log_oom();
296
297                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
298                                 log_error("Invalid bind mount specification: %s", optarg);
299                                 return -EINVAL;
300                         }
301
302                         r = strv_extend(x, a);
303                         if (r < 0)
304                                 return r;
305
306                         r = strv_extend(x, b);
307                         if (r < 0)
308                                 return r;
309
310                         break;
311                 }
312
313                 case '?':
314                         return -EINVAL;
315
316                 default:
317                         log_error("Unknown option code %c", c);
318                         return -EINVAL;
319                 }
320         }
321
322         return 1;
323 }
324
325 static int mount_all(const char *dest) {
326
327         typedef struct MountPoint {
328                 const char *what;
329                 const char *where;
330                 const char *type;
331                 const char *options;
332                 unsigned long flags;
333                 bool fatal;
334         } MountPoint;
335
336         static const MountPoint mount_table[] = {
337                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
338                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
339                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
340                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
341                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
342                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
343                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
344                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
345 #ifdef HAVE_SELINUX
346                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
347                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
348 #endif
349         };
350
351         unsigned k;
352         int r = 0;
353
354         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
355                 char _cleanup_free_ *where = NULL;
356                 int t;
357
358                 where = strjoin(dest, "/", mount_table[k].where, NULL);
359                 if (!where)
360                         return log_oom();
361
362                 t = path_is_mount_point(where, true);
363                 if (t < 0) {
364                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
365
366                         if (r == 0)
367                                 r = t;
368
369                         continue;
370                 }
371
372                 /* Skip this entry if it is not a remount. */
373                 if (mount_table[k].what && t > 0)
374                         continue;
375
376                 mkdir_p(where, 0755);
377
378                 if (mount(mount_table[k].what,
379                           where,
380                           mount_table[k].type,
381                           mount_table[k].flags,
382                           mount_table[k].options) < 0 &&
383                     mount_table[k].fatal) {
384
385                         log_error("mount(%s) failed: %m", where);
386
387                         if (r == 0)
388                                 r = -errno;
389                 }
390         }
391
392         return r;
393 }
394
395 static int mount_binds(const char *dest, char **l, unsigned long flags) {
396         char **x, **y;
397
398         STRV_FOREACH_PAIR(x, y, l) {
399                 _cleanup_free_ char *where = NULL;
400
401                 where = strjoin(dest, "/", *y, NULL);
402                 if (!where)
403                         return log_oom();
404
405                 mkdir_p_label(where, 0755);
406
407                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
408                         log_error("mount(%s) failed: %m", where);
409                         return -errno;
410                 }
411
412                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
413                         log_error("mount(%s) failed: %m", where);
414                         return -errno;
415                 }
416         }
417
418         return 0;
419 }
420
421 static int setup_timezone(const char *dest) {
422         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
423         char *z, *y;
424         int r;
425
426         assert(dest);
427
428         /* Fix the timezone, if possible */
429         r = readlink_malloc("/etc/localtime", &p);
430         if (r < 0) {
431                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
432                 return 0;
433         }
434
435         z = path_startswith(p, "../usr/share/zoneinfo/");
436         if (!z)
437                 z = path_startswith(p, "/usr/share/zoneinfo/");
438         if (!z) {
439                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
440                 return 0;
441         }
442
443         where = strappend(dest, "/etc/localtime");
444         if (!where)
445                 return log_oom();
446
447         r = readlink_malloc(where, &q);
448         if (r >= 0) {
449                 y = path_startswith(q, "../usr/share/zoneinfo/");
450                 if (!y)
451                         y = path_startswith(q, "/usr/share/zoneinfo/");
452
453
454                 /* Already pointing to the right place? Then do nothing .. */
455                 if (y && streq(y, z))
456                         return 0;
457         }
458
459         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
460         if (!check)
461                 return log_oom();
462
463         if (access(check, F_OK) < 0) {
464                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
465                 return 0;
466         }
467
468         what = strappend("../usr/share/zoneinfo/", z);
469         if (!what)
470                 return log_oom();
471
472         unlink(where);
473         if (symlink(what, where) < 0) {
474                 log_error("Failed to correct timezone of container: %m");
475                 return 0;
476         }
477
478         return 0;
479 }
480
481 static int setup_resolv_conf(const char *dest) {
482         char *where;
483
484         assert(dest);
485
486         if (arg_private_network)
487                 return 0;
488
489         /* Fix resolv.conf, if possible */
490         where = strappend(dest, "/etc/resolv.conf");
491         if (!where)
492                 return log_oom();
493
494         /* We don't really care for the results of this really. If it
495          * fails, it fails, but meh... */
496         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
497                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
498
499         free(where);
500
501         return 0;
502 }
503
504 static int setup_boot_id(const char *dest) {
505         char _cleanup_free_ *from = NULL, *to = NULL;
506         sd_id128_t rnd;
507         char as_uuid[37];
508         int r;
509
510         assert(dest);
511
512         /* Generate a new randomized boot ID, so that each boot-up of
513          * the container gets a new one */
514
515         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
516         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
517         if (!from || !to)
518                 return log_oom();
519
520         r = sd_id128_randomize(&rnd);
521         if (r < 0) {
522                 log_error("Failed to generate random boot id: %s", strerror(-r));
523                 return r;
524         }
525
526         snprintf(as_uuid, sizeof(as_uuid),
527                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
528                  SD_ID128_FORMAT_VAL(rnd));
529         char_array_0(as_uuid);
530
531         r = write_string_file(from, as_uuid);
532         if (r < 0) {
533                 log_error("Failed to write boot id: %s", strerror(-r));
534                 return r;
535         }
536
537         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
538                 log_error("Failed to bind mount boot id: %m");
539                 r = -errno;
540         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
541                 log_warning("Failed to make boot id read-only: %m");
542
543         unlink(from);
544         return r;
545 }
546
547 static int copy_devnodes(const char *dest) {
548
549         static const char devnodes[] =
550                 "null\0"
551                 "zero\0"
552                 "full\0"
553                 "random\0"
554                 "urandom\0"
555                 "tty\0";
556
557         const char *d;
558         int r = 0;
559         mode_t _cleanup_umask_ u;
560
561         assert(dest);
562
563         u = umask(0000);
564
565         NULSTR_FOREACH(d, devnodes) {
566                 struct stat st;
567                 char _cleanup_free_ *from = NULL, *to = NULL;
568
569                 asprintf(&from, "/dev/%s", d);
570                 asprintf(&to, "%s/dev/%s", dest, d);
571
572                 if (!from || !to) {
573                         log_oom();
574
575                         if (r == 0)
576                                 r = -ENOMEM;
577
578                         break;
579                 }
580
581                 if (stat(from, &st) < 0) {
582
583                         if (errno != ENOENT) {
584                                 log_error("Failed to stat %s: %m", from);
585                                 if (r == 0)
586                                         r = -errno;
587                         }
588
589                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
590
591                         log_error("%s is not a char or block device, cannot copy", from);
592                         if (r == 0)
593                                 r = -EIO;
594
595                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
596
597                         log_error("mknod(%s) failed: %m", dest);
598                         if (r == 0)
599                                 r = -errno;
600                 }
601         }
602
603         return r;
604 }
605
606 static int setup_ptmx(const char *dest) {
607         _cleanup_free_ char *p = NULL;
608
609         p = strappend(dest, "/dev/ptmx");
610         if (!p)
611                 return log_oom();
612
613         if (symlink("pts/ptmx", p) < 0) {
614                 log_error("Failed to create /dev/ptmx symlink: %m");
615                 return -errno;
616         }
617
618         return 0;
619 }
620
621 static int setup_dev_console(const char *dest, const char *console) {
622         struct stat st;
623         char _cleanup_free_ *to = NULL;
624         int r;
625         mode_t _cleanup_umask_ u;
626
627         assert(dest);
628         assert(console);
629
630         u = umask(0000);
631
632         if (stat(console, &st) < 0) {
633                 log_error("Failed to stat %s: %m", console);
634                 return -errno;
635
636         } else if (!S_ISCHR(st.st_mode)) {
637                 log_error("/dev/console is not a char device");
638                 return -EIO;
639         }
640
641         r = chmod_and_chown(console, 0600, 0, 0);
642         if (r < 0) {
643                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
644                 return r;
645         }
646
647         if (asprintf(&to, "%s/dev/console", dest) < 0)
648                 return log_oom();
649
650         /* We need to bind mount the right tty to /dev/console since
651          * ptys can only exist on pts file systems. To have something
652          * to bind mount things on we create a device node first, that
653          * has the right major/minor (note that the major minor
654          * doesn't actually matter here, since we mount it over
655          * anyway). */
656
657         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
658                 log_error("mknod() for /dev/console failed: %m");
659                 return -errno;
660         }
661
662         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
663                 log_error("Bind mount for /dev/console failed: %m");
664                 return -errno;
665         }
666
667         return 0;
668 }
669
670 static int setup_kmsg(const char *dest, int kmsg_socket) {
671         char _cleanup_free_ *from = NULL, *to = NULL;
672         int r, fd, k;
673         mode_t _cleanup_umask_ u;
674         union {
675                 struct cmsghdr cmsghdr;
676                 uint8_t buf[CMSG_SPACE(sizeof(int))];
677         } control = {};
678         struct msghdr mh = {
679                 .msg_control = &control,
680                 .msg_controllen = sizeof(control),
681         };
682         struct cmsghdr *cmsg;
683
684         assert(dest);
685         assert(kmsg_socket >= 0);
686
687         u = umask(0000);
688
689         /* We create the kmsg FIFO as /dev/kmsg, but immediately
690          * delete it after bind mounting it to /proc/kmsg. While FIFOs
691          * on the reading side behave very similar to /proc/kmsg,
692          * their writing side behaves differently from /dev/kmsg in
693          * that writing blocks when nothing is reading. In order to
694          * avoid any problems with containers deadlocking due to this
695          * we simply make /dev/kmsg unavailable to the container. */
696         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
697             asprintf(&to, "%s/proc/kmsg", dest) < 0)
698                 return log_oom();
699
700         if (mkfifo(from, 0600) < 0) {
701                 log_error("mkfifo() for /dev/kmsg failed: %m");
702                 return -errno;
703         }
704
705         r = chmod_and_chown(from, 0600, 0, 0);
706         if (r < 0) {
707                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
708                 return r;
709         }
710
711         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
712                 log_error("Bind mount for /proc/kmsg failed: %m");
713                 return -errno;
714         }
715
716         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
717         if (fd < 0) {
718                 log_error("Failed to open fifo: %m");
719                 return -errno;
720         }
721
722         cmsg = CMSG_FIRSTHDR(&mh);
723         cmsg->cmsg_level = SOL_SOCKET;
724         cmsg->cmsg_type = SCM_RIGHTS;
725         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
726         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
727
728         mh.msg_controllen = cmsg->cmsg_len;
729
730         /* Store away the fd in the socket, so that it stays open as
731          * long as we run the child */
732         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
733         close_nointr_nofail(fd);
734
735         if (k < 0) {
736                 log_error("Failed to send FIFO fd: %m");
737                 return -errno;
738         }
739
740         /* And now make the FIFO unavailable as /dev/kmsg... */
741         unlink(from);
742         return 0;
743 }
744
745 static int setup_hostname(void) {
746         char *hn;
747         int r = 0;
748
749         hn = path_get_file_name(arg_directory);
750         if (hn) {
751                 hn = strdup(hn);
752                 if (!hn)
753                         return -ENOMEM;
754
755                 hostname_cleanup(hn);
756
757                 if (!isempty(hn))
758                         if (sethostname(hn, strlen(hn)) < 0)
759                                 r = -errno;
760
761                 free(hn);
762         }
763
764         return r;
765 }
766
767 static int setup_journal(const char *directory) {
768         sd_id128_t machine_id;
769         char _cleanup_free_ *p = NULL, *b = NULL, *q = NULL, *d = NULL;
770         char *id;
771         int r;
772
773         if (arg_link_journal == LINK_NO)
774                 return 0;
775
776         p = strappend(directory, "/etc/machine-id");
777         if (!p)
778                 return log_oom();
779
780         r = read_one_line_file(p, &b);
781         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
782                 return 0;
783         else if (r < 0) {
784                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
785                 return r;
786         }
787
788         id = strstrip(b);
789         if (isempty(id) && arg_link_journal == LINK_AUTO)
790                 return 0;
791
792         /* Verify validity */
793         r = sd_id128_from_string(id, &machine_id);
794         if (r < 0) {
795                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
796                 return r;
797         }
798
799         free(p);
800         p = strappend("/var/log/journal/", id);
801         q = strjoin(directory, "/var/log/journal/", id, NULL);
802         if (!p || !q)
803                 return log_oom();
804
805         if (path_is_mount_point(p, false) > 0) {
806                 if (arg_link_journal != LINK_AUTO) {
807                         log_error("%s: already a mount point, refusing to use for journal", p);
808                         return -EEXIST;
809                 }
810
811                 return 0;
812         }
813
814         if (path_is_mount_point(q, false) > 0) {
815                 if (arg_link_journal != LINK_AUTO) {
816                         log_error("%s: already a mount point, refusing to use for journal", q);
817                         return -EEXIST;
818                 }
819
820                 return 0;
821         }
822
823         r = readlink_and_make_absolute(p, &d);
824         if (r >= 0) {
825                 if ((arg_link_journal == LINK_GUEST ||
826                      arg_link_journal == LINK_AUTO) &&
827                     path_equal(d, q)) {
828
829                         r = mkdir_p(q, 0755);
830                         if (r < 0)
831                                 log_warning("failed to create directory %s: %m", q);
832                         return 0;
833                 }
834
835                 if (unlink(p) < 0) {
836                         log_error("Failed to remove symlink %s: %m", p);
837                         return -errno;
838                 }
839         } else if (r == -EINVAL) {
840
841                 if (arg_link_journal == LINK_GUEST &&
842                     rmdir(p) < 0) {
843
844                         if (errno == ENOTDIR) {
845                                 log_error("%s already exists and is neither a symlink nor a directory", p);
846                                 return r;
847                         } else {
848                                 log_error("Failed to remove %s: %m", p);
849                                 return -errno;
850                         }
851                 }
852         } else if (r != -ENOENT) {
853                 log_error("readlink(%s) failed: %m", p);
854                 return r;
855         }
856
857         if (arg_link_journal == LINK_GUEST) {
858
859                 if (symlink(q, p) < 0) {
860                         log_error("Failed to symlink %s to %s: %m", q, p);
861                         return -errno;
862                 }
863
864                 r = mkdir_p(q, 0755);
865                 if (r < 0)
866                         log_warning("failed to create directory %s: %m", q);
867                 return 0;
868         }
869
870         if (arg_link_journal == LINK_HOST) {
871                 r = mkdir_p(p, 0755);
872                 if (r < 0) {
873                         log_error("Failed to create %s: %m", p);
874                         return r;
875                 }
876
877         } else if (access(p, F_OK) < 0)
878                 return 0;
879
880         if (dir_is_empty(q) == 0) {
881                 log_error("%s not empty.", q);
882                 return -ENOTEMPTY;
883         }
884
885         r = mkdir_p(q, 0755);
886         if (r < 0) {
887                 log_error("Failed to create %s: %m", q);
888                 return r;
889         }
890
891         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
892                 log_error("Failed to bind mount journal from host into guest: %m");
893                 return -errno;
894         }
895
896         return 0;
897 }
898
899 static int drop_capabilities(void) {
900         return capability_bounding_set_drop(~arg_retain, false);
901 }
902
/* Checks whether the directory 'path' looks like the root of an OS tree.
 *
 * The existence of <path>/bin/sh is used as the marker.
 *
 * Returns 1 if the marker exists, 0 if it cannot be accessed, and
 * -ENOMEM if the path string could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        /* Only existence matters here, not permissions */
        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
916
917 static int process_pty(int master, pid_t pid, sigset_t *mask) {
918
919         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
920         size_t in_buffer_full = 0, out_buffer_full = 0;
921         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
922         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
923         int ep = -1, signal_fd = -1, r;
924         bool tried_orderly_shutdown = false;
925
926         assert(master >= 0);
927         assert(pid > 0);
928         assert(mask);
929
930         fd_nonblock(STDIN_FILENO, 1);
931         fd_nonblock(STDOUT_FILENO, 1);
932         fd_nonblock(master, 1);
933
934         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
935         if (signal_fd < 0) {
936                 log_error("signalfd(): %m");
937                 r = -errno;
938                 goto finish;
939         }
940
941         ep = epoll_create1(EPOLL_CLOEXEC);
942         if (ep < 0) {
943                 log_error("Failed to create epoll: %m");
944                 r = -errno;
945                 goto finish;
946         }
947
948         /* We read from STDIN only if this is actually a TTY,
949          * otherwise we assume non-interactivity. */
950         if (isatty(STDIN_FILENO)) {
951                 zero(stdin_ev);
952                 stdin_ev.events = EPOLLIN|EPOLLET;
953                 stdin_ev.data.fd = STDIN_FILENO;
954
955                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
956                         log_error("Failed to register STDIN in epoll: %m");
957                         r = -errno;
958                         goto finish;
959                 }
960         }
961
962         zero(stdout_ev);
963         stdout_ev.events = EPOLLOUT|EPOLLET;
964         stdout_ev.data.fd = STDOUT_FILENO;
965
966         zero(master_ev);
967         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
968         master_ev.data.fd = master;
969
970         zero(signal_ev);
971         signal_ev.events = EPOLLIN;
972         signal_ev.data.fd = signal_fd;
973
974         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
975                 if (errno != EPERM) {
976                         log_error("Failed to register stdout in epoll: %m");
977                         r = -errno;
978                         goto finish;
979                 }
980                 /* stdout without epoll support. Likely redirected to regular file. */
981                 stdout_writable = true;
982         }
983
984         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
985             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
986                 log_error("Failed to register fds in epoll: %m");
987                 r = -errno;
988                 goto finish;
989         }
990
991         for (;;) {
992                 struct epoll_event ev[16];
993                 ssize_t k;
994                 int i, nfds;
995
996                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
997                 if (nfds < 0) {
998
999                         if (errno == EINTR || errno == EAGAIN)
1000                                 continue;
1001
1002                         log_error("epoll_wait(): %m");
1003                         r = -errno;
1004                         goto finish;
1005                 }
1006
1007                 assert(nfds >= 1);
1008
1009                 for (i = 0; i < nfds; i++) {
1010                         if (ev[i].data.fd == STDIN_FILENO) {
1011
1012                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1013                                         stdin_readable = true;
1014
1015                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1016
1017                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1018                                         stdout_writable = true;
1019
1020                         } else if (ev[i].data.fd == master) {
1021
1022                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1023                                         master_readable = true;
1024
1025                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1026                                         master_writable = true;
1027
1028                         } else if (ev[i].data.fd == signal_fd) {
1029                                 struct signalfd_siginfo sfsi;
1030                                 ssize_t n;
1031
1032                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1033                                 if (n != sizeof(sfsi)) {
1034
1035                                         if (n >= 0) {
1036                                                 log_error("Failed to read from signalfd: invalid block size");
1037                                                 r = -EIO;
1038                                                 goto finish;
1039                                         }
1040
1041                                         if (errno != EINTR && errno != EAGAIN) {
1042                                                 log_error("Failed to read from signalfd: %m");
1043                                                 r = -errno;
1044                                                 goto finish;
1045                                         }
1046                                 } else {
1047
1048                                         if (sfsi.ssi_signo == SIGWINCH) {
1049                                                 struct winsize ws;
1050
1051                                                 /* The window size changed, let's forward that. */
1052                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1053                                                         ioctl(master, TIOCSWINSZ, &ws);
1054                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1055
1056                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1057
1058                                                 /* This only works for systemd... */
1059                                                 tried_orderly_shutdown = true;
1060                                                 kill(pid, SIGRTMIN+3);
1061
1062                                         } else {
1063                                                 r = 0;
1064                                                 goto finish;
1065                                         }
1066                                 }
1067                         }
1068                 }
1069
1070                 while ((stdin_readable && in_buffer_full <= 0) ||
1071                        (master_writable && in_buffer_full > 0) ||
1072                        (master_readable && out_buffer_full <= 0) ||
1073                        (stdout_writable && out_buffer_full > 0)) {
1074
1075                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1076
1077                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1078                                 if (k < 0) {
1079
1080                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1081                                                 stdin_readable = false;
1082                                         else {
1083                                                 log_error("read(): %m");
1084                                                 r = -errno;
1085                                                 goto finish;
1086                                         }
1087                                 } else
1088                                         in_buffer_full += (size_t) k;
1089                         }
1090
1091                         if (master_writable && in_buffer_full > 0) {
1092
1093                                 k = write(master, in_buffer, in_buffer_full);
1094                                 if (k < 0) {
1095
1096                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1097                                                 master_writable = false;
1098                                         else {
1099                                                 log_error("write(): %m");
1100                                                 r = -errno;
1101                                                 goto finish;
1102                                         }
1103
1104                                 } else {
1105                                         assert(in_buffer_full >= (size_t) k);
1106                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1107                                         in_buffer_full -= k;
1108                                 }
1109                         }
1110
1111                         if (master_readable && out_buffer_full < LINE_MAX) {
1112
1113                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1114                                 if (k < 0) {
1115
1116                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1117                                                 master_readable = false;
1118                                         else {
1119                                                 log_error("read(): %m");
1120                                                 r = -errno;
1121                                                 goto finish;
1122                                         }
1123                                 }  else
1124                                         out_buffer_full += (size_t) k;
1125                         }
1126
1127                         if (stdout_writable && out_buffer_full > 0) {
1128
1129                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1130                                 if (k < 0) {
1131
1132                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1133                                                 stdout_writable = false;
1134                                         else {
1135                                                 log_error("write(): %m");
1136                                                 r = -errno;
1137                                                 goto finish;
1138                                         }
1139
1140                                 } else {
1141                                         assert(out_buffer_full >= (size_t) k);
1142                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1143                                         out_buffer_full -= k;
1144                                 }
1145                         }
1146                 }
1147         }
1148
1149 finish:
1150         if (ep >= 0)
1151                 close_nointr_nofail(ep);
1152
1153         if (signal_fd >= 0)
1154                 close_nointr_nofail(signal_fd);
1155
1156         return r;
1157 }
1158
1159 int main(int argc, char *argv[]) {
1160         pid_t pid = 0;
1161         int r = EXIT_FAILURE, k;
1162         char *oldcg = NULL, *newcg = NULL;
1163         char **controller = NULL;
1164         int master = -1, n_fd_passed;
1165         const char *console = NULL;
1166         struct termios saved_attr, raw_attr;
1167         sigset_t mask;
1168         bool saved_attr_valid = false;
1169         struct winsize ws;
1170         int kmsg_socket_pair[2] = { -1, -1 };
1171         FDSet *fds = NULL;
1172
1173         log_parse_environment();
1174         log_open();
1175
1176         r = parse_argv(argc, argv);
1177         if (r <= 0)
1178                 goto finish;
1179
1180         if (arg_directory) {
1181                 char *p;
1182
1183                 p = path_make_absolute_cwd(arg_directory);
1184                 free(arg_directory);
1185                 arg_directory = p;
1186         } else
1187                 arg_directory = get_current_dir_name();
1188
1189         if (!arg_directory) {
1190                 log_error("Failed to determine path");
1191                 goto finish;
1192         }
1193
1194         path_kill_slashes(arg_directory);
1195
1196         if (geteuid() != 0) {
1197                 log_error("Need to be root.");
1198                 goto finish;
1199         }
1200
1201         if (sd_booted() <= 0) {
1202                 log_error("Not running on a systemd system.");
1203                 goto finish;
1204         }
1205
1206         if (path_equal(arg_directory, "/")) {
1207                 log_error("Spawning container on root directory not supported.");
1208                 goto finish;
1209         }
1210
1211         if (is_os_tree(arg_directory) <= 0) {
1212                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1213                 goto finish;
1214         }
1215
1216         log_close();
1217         n_fd_passed = sd_listen_fds(false);
1218         if (n_fd_passed > 0) {
1219                 k = fdset_new_listen_fds(&fds, false);
1220                 if (k < 0) {
1221                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1222                         goto finish;
1223                 }
1224         }
1225         fdset_close_others(fds);
1226         log_open();
1227
1228         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1229         if (k < 0) {
1230                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1231                 goto finish;
1232         }
1233
1234         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1235                 log_error("Failed to allocate cgroup path.");
1236                 goto finish;
1237         }
1238
1239         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1240         if (k < 0)  {
1241                 log_error("Failed to create cgroup: %s", strerror(-k));
1242                 goto finish;
1243         }
1244
1245         STRV_FOREACH(controller, arg_controllers) {
1246                 k = cg_create_and_attach(*controller, newcg, 0);
1247                 if (k < 0)
1248                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1249         }
1250
1251         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1252         if (master < 0) {
1253                 log_error("Failed to acquire pseudo tty: %m");
1254                 goto finish;
1255         }
1256
1257         console = ptsname(master);
1258         if (!console) {
1259                 log_error("Failed to determine tty name: %m");
1260                 goto finish;
1261         }
1262
1263         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1264
1265         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1266                 ioctl(master, TIOCSWINSZ, &ws);
1267
1268         if (unlockpt(master) < 0) {
1269                 log_error("Failed to unlock tty: %m");
1270                 goto finish;
1271         }
1272
1273         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1274                 saved_attr_valid = true;
1275
1276                 raw_attr = saved_attr;
1277                 cfmakeraw(&raw_attr);
1278                 raw_attr.c_lflag &= ~ECHO;
1279         }
1280
1281         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1282                 log_error("Failed to create kmsg socket pair");
1283                 goto finish;
1284         }
1285
1286         assert_se(sigemptyset(&mask) == 0);
1287         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1288         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1289
1290         for (;;) {
1291                 siginfo_t status;
1292                 int pipefd[2];
1293
1294                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1295                         log_error("pipe2(): %m");
1296                         goto finish;
1297                 }
1298
1299                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1300                 if (pid < 0) {
1301                         if (errno == EINVAL)
1302                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1303                         else
1304                                 log_error("clone() failed: %m");
1305
1306                         goto finish;
1307                 }
1308
1309                 if (pid == 0) {
1310                         /* child */
1311                         const char *home = NULL;
1312                         uid_t uid = (uid_t) -1;
1313                         gid_t gid = (gid_t) -1;
1314                         unsigned n_env = 2;
1315                         const char *envp[] = {
1316                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1317                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1318                                 NULL, /* TERM */
1319                                 NULL, /* HOME */
1320                                 NULL, /* USER */
1321                                 NULL, /* LOGNAME */
1322                                 NULL, /* container_uuid */
1323                                 NULL, /* LISTEN_FDS */
1324                                 NULL, /* LISTEN_PID */
1325                                 NULL
1326                         };
1327
1328                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1329                         if (envp[n_env])
1330                                 n_env ++;
1331
1332                         close_nointr_nofail(pipefd[1]);
1333                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1334                         close_nointr_nofail(pipefd[0]);
1335
1336                         close_nointr_nofail(master);
1337                         master = -1;
1338
1339                         if (saved_attr_valid) {
1340                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1341                                         log_error("Failed to set terminal attributes: %m");
1342                                         goto child_fail;
1343                                 }
1344                         }
1345
1346                         close_nointr(STDIN_FILENO);
1347                         close_nointr(STDOUT_FILENO);
1348                         close_nointr(STDERR_FILENO);
1349
1350                         close_nointr_nofail(kmsg_socket_pair[0]);
1351                         kmsg_socket_pair[0] = -1;
1352
1353                         reset_all_signal_handlers();
1354
1355                         assert_se(sigemptyset(&mask) == 0);
1356                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1357
1358                         k = open_terminal(console, O_RDWR);
1359                         if (k != STDIN_FILENO) {
1360                                 if (k >= 0) {
1361                                         close_nointr_nofail(k);
1362                                         k = -EINVAL;
1363                                 }
1364
1365                                 log_error("Failed to open console: %s", strerror(-k));
1366                                 goto child_fail;
1367                         }
1368
1369                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1370                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1371                                 log_error("Failed to duplicate console: %m");
1372                                 goto child_fail;
1373                         }
1374
1375                         if (setsid() < 0) {
1376                                 log_error("setsid() failed: %m");
1377                                 goto child_fail;
1378                         }
1379
1380                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1381                                 log_error("PR_SET_PDEATHSIG failed: %m");
1382                                 goto child_fail;
1383                         }
1384
1385                         /* Mark everything as slave, so that we still
1386                          * receive mounts from the real root, but don't
1387                          * propagate mounts to the real root. */
1388                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1389                                 log_error("MS_SLAVE|MS_REC failed: %m");
1390                                 goto child_fail;
1391                         }
1392
1393                         /* Turn directory into bind mount */
1394                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1395                                 log_error("Failed to make bind mount.");
1396                                 goto child_fail;
1397                         }
1398
1399                         if (arg_read_only)
1400                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1401                                         log_error("Failed to make read-only.");
1402                                         goto child_fail;
1403                                 }
1404
1405                         if (mount_all(arg_directory) < 0)
1406                                 goto child_fail;
1407
1408                         if (copy_devnodes(arg_directory) < 0)
1409                                 goto child_fail;
1410
1411                         if (setup_ptmx(arg_directory) < 0)
1412                                 goto child_fail;
1413
1414                         dev_setup(arg_directory);
1415
1416                         if (setup_dev_console(arg_directory, console) < 0)
1417                                 goto child_fail;
1418
1419                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1420                                 goto child_fail;
1421
1422                         close_nointr_nofail(kmsg_socket_pair[1]);
1423                         kmsg_socket_pair[1] = -1;
1424
1425                         if (setup_boot_id(arg_directory) < 0)
1426                                 goto child_fail;
1427
1428                         if (setup_timezone(arg_directory) < 0)
1429                                 goto child_fail;
1430
1431                         if (setup_resolv_conf(arg_directory) < 0)
1432                                 goto child_fail;
1433
1434                         if (setup_journal(arg_directory) < 0)
1435                                 goto child_fail;
1436
1437                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1438                                 goto child_fail;
1439
1440                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1441                                 goto child_fail;
1442
1443                         if (chdir(arg_directory) < 0) {
1444                                 log_error("chdir(%s) failed: %m", arg_directory);
1445                                 goto child_fail;
1446                         }
1447
1448                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1449                                 log_error("mount(MS_MOVE) failed: %m");
1450                                 goto child_fail;
1451                         }
1452
1453                         if (chroot(".") < 0) {
1454                                 log_error("chroot() failed: %m");
1455                                 goto child_fail;
1456                         }
1457
1458                         if (chdir("/") < 0) {
1459                                 log_error("chdir() failed: %m");
1460                                 goto child_fail;
1461                         }
1462
1463                         umask(0022);
1464
1465                         loopback_setup();
1466
1467                         if (drop_capabilities() < 0) {
1468                                 log_error("drop_capabilities() failed: %m");
1469                                 goto child_fail;
1470                         }
1471
1472                         if (arg_user) {
1473
1474                                 /* Note that this resolves user names
1475                                  * inside the container, and hence
1476                                  * accesses the NSS modules from the
1477                                  * container and not the host. This is
1478                                  * a bit weird... */
1479
1480                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1481                                         log_error("get_user_creds() failed: %m");
1482                                         goto child_fail;
1483                                 }
1484
1485                                 if (mkdir_parents_label(home, 0775) < 0) {
1486                                         log_error("mkdir_parents_label() failed: %m");
1487                                         goto child_fail;
1488                                 }
1489
1490                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1491                                         log_error("mkdir_safe_label() failed: %m");
1492                                         goto child_fail;
1493                                 }
1494
1495                                 if (initgroups((const char*)arg_user, gid) < 0) {
1496                                         log_error("initgroups() failed: %m");
1497                                         goto child_fail;
1498                                 }
1499
1500                                 if (setresgid(gid, gid, gid) < 0) {
1501                                         log_error("setregid() failed: %m");
1502                                         goto child_fail;
1503                                 }
1504
1505                                 if (setresuid(uid, uid, uid) < 0) {
1506                                         log_error("setreuid() failed: %m");
1507                                         goto child_fail;
1508                                 }
1509                         } else {
1510                                 /* Reset everything fully to 0, just in case */
1511
1512                                 if (setgroups(0, NULL) < 0) {
1513                                         log_error("setgroups() failed: %m");
1514                                         goto child_fail;
1515                                 }
1516
1517                                 if (setresgid(0, 0, 0) < 0) {
1518                                         log_error("setregid() failed: %m");
1519                                         goto child_fail;
1520                                 }
1521
1522                                 if (setresuid(0, 0, 0) < 0) {
1523                                         log_error("setreuid() failed: %m");
1524                                         goto child_fail;
1525                                 }
1526                         }
1527
1528                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1529                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1530                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1531                                 log_oom();
1532                                 goto child_fail;
1533                         }
1534
1535                         if (arg_uuid) {
1536                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1537                                         log_oom();
1538                                         goto child_fail;
1539                                 }
1540                         }
1541
1542                         if (fdset_size(fds) > 0) {
1543                                 k = fdset_cloexec(fds, false);
1544                                 if (k < 0) {
1545                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1546                                         goto child_fail;
1547                                 }
1548
1549                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1550                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) getpid()) < 0)) {
1551                                         log_oom();
1552                                         goto child_fail;
1553                                 }
1554                         }
1555
1556                         setup_hostname();
1557
1558                         if (arg_boot) {
1559                                 char **a;
1560                                 size_t l;
1561
1562                                 /* Automatically search for the init system */
1563
1564                                 l = 1 + argc - optind;
1565                                 a = newa(char*, l + 1);
1566                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1567
1568                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1569                                 execve(a[0], a, (char**) envp);
1570
1571                                 a[0] = (char*) "/lib/systemd/systemd";
1572                                 execve(a[0], a, (char**) envp);
1573
1574                                 a[0] = (char*) "/sbin/init";
1575                                 execve(a[0], a, (char**) envp);
1576                         } else if (argc > optind)
1577                                 execvpe(argv[optind], argv + optind, (char**) envp);
1578                         else {
1579                                 chdir(home ? home : "/root");
1580                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1581                         }
1582
1583                         log_error("execv() failed: %m");
1584
1585                 child_fail:
1586                         _exit(EXIT_FAILURE);
1587                 }
1588
1589                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1590                 close_nointr_nofail(pipefd[0]);
1591                 close_nointr_nofail(pipefd[1]);
1592
1593                 fdset_free(fds);
1594                 fds = NULL;
1595
1596                 if (process_pty(master, pid, &mask) < 0)
1597                         goto finish;
1598
1599                 if (saved_attr_valid)
1600                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1601
1602                 r = wait_for_terminate(pid, &status);
1603                 if (r < 0) {
1604                         r = EXIT_FAILURE;
1605                         break;
1606                 }
1607
1608                 if (status.si_code == CLD_EXITED) {
1609                         if (status.si_status != 0) {
1610                                 log_error("Container failed with error code %i.", status.si_status);
1611                                 r = status.si_status;
1612                                 break;
1613                         }
1614
1615                         log_debug("Container exited successfully.");
1616                         break;
1617                 } else if (status.si_code == CLD_KILLED &&
1618                            status.si_status == SIGINT) {
1619                         log_info("Container has been shut down.");
1620                         r = 0;
1621                         break;
1622                 } else if (status.si_code == CLD_KILLED &&
1623                            status.si_status == SIGHUP) {
1624                         log_info("Container is being rebooted.");
1625                         continue;
1626                 } else if (status.si_code == CLD_KILLED ||
1627                            status.si_code == CLD_DUMPED) {
1628
1629                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1630                         r = EXIT_FAILURE;
1631                         break;
1632                 } else {
1633                         log_error("Container failed due to unknown reason.");
1634                         r = EXIT_FAILURE;
1635                         break;
1636                 }
1637         }
1638
1639 finish:
1640         if (saved_attr_valid)
1641                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1642
1643         if (master >= 0)
1644                 close_nointr_nofail(master);
1645
1646         close_pipe(kmsg_socket_pair);
1647
1648         if (oldcg)
1649                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1650
1651         if (newcg)
1652                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1653
1654         free(arg_directory);
1655         strv_free(arg_controllers);
1656         free(oldcg);
1657         free(newcg);
1658
1659         fdset_free(fds);
1660
1661         return r;
1662 }