chiark / gitweb /
e907a1daa7f31549d45bca1747fc4523e3990780
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <attr/xattr.h>
26 #include <sys/types.h>
27 #include <sys/syscall.h>
28 #include <sys/mount.h>
29 #include <sys/wait.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <stdio.h>
33 #include <errno.h>
34 #include <sys/prctl.h>
35 #include <sys/capability.h>
36 #include <getopt.h>
37 #include <sys/poll.h>
38 #include <sys/epoll.h>
39 #include <termios.h>
40 #include <sys/signalfd.h>
41 #include <grp.h>
42 #include <linux/fs.h>
43 #include <sys/un.h>
44 #include <sys/socket.h>
45
46 #include <systemd/sd-daemon.h>
47
48 #include "log.h"
49 #include "util.h"
50 #include "mkdir.h"
51 #include "macro.h"
52 #include "audit.h"
53 #include "missing.h"
54 #include "cgroup-util.h"
55 #include "strv.h"
56 #include "path-util.h"
57 #include "loopback-setup.h"
58 #include "sd-id128.h"
59 #include "dev-setup.h"
60 #include "fdset.h"
61 #include "build.h"
62 #include "fileio.h"
63
/* Fallback group id used in the devpts "gid=" mount option when the
 * build system does not provide one. */
#if !defined(TTY_GID)
#  define TTY_GID 5
#endif
67
/* Journal linking modes selectable with --link-journal= ("no", "auto",
 * "host", "guest"). */
enum LinkJournal {
        LINK_NO,        /* --link-journal=no */
        LINK_AUTO,      /* --link-journal=auto, the default */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, also selected by -j */
};

typedef enum LinkJournal LinkJournal;
74
75 static char *arg_directory = NULL;
76 static char *arg_user = NULL;
77 static char **arg_controllers = NULL;
78 static char *arg_uuid = NULL;
79 static char *arg_machine = NULL;
80 static bool arg_private_network = false;
81 static bool arg_read_only = false;
82 static bool arg_boot = false;
83 static LinkJournal arg_link_journal = LINK_AUTO;
84 static uint64_t arg_retain =
85         (1ULL << CAP_CHOWN) |
86         (1ULL << CAP_DAC_OVERRIDE) |
87         (1ULL << CAP_DAC_READ_SEARCH) |
88         (1ULL << CAP_FOWNER) |
89         (1ULL << CAP_FSETID) |
90         (1ULL << CAP_IPC_OWNER) |
91         (1ULL << CAP_KILL) |
92         (1ULL << CAP_LEASE) |
93         (1ULL << CAP_LINUX_IMMUTABLE) |
94         (1ULL << CAP_NET_BIND_SERVICE) |
95         (1ULL << CAP_NET_BROADCAST) |
96         (1ULL << CAP_NET_RAW) |
97         (1ULL << CAP_SETGID) |
98         (1ULL << CAP_SETFCAP) |
99         (1ULL << CAP_SETPCAP) |
100         (1ULL << CAP_SETUID) |
101         (1ULL << CAP_SYS_ADMIN) |
102         (1ULL << CAP_SYS_CHROOT) |
103         (1ULL << CAP_SYS_NICE) |
104         (1ULL << CAP_SYS_PTRACE) |
105         (1ULL << CAP_SYS_TTY_CONFIG) |
106         (1ULL << CAP_SYS_RESOURCE) |
107         (1ULL << CAP_SYS_BOOT) |
108         (1ULL << CAP_AUDIT_WRITE) |
109         (1ULL << CAP_AUDIT_CONTROL);
110 static char **arg_bind = NULL;
111 static char **arg_bind_ro = NULL;
112
/* Prints the usage summary to stdout, prefixed with the name the
 * program was invoked as. Always returns 0. */
static int help(void) {

        /* Note: -j is documented as "guest" because parse_argv() maps
         * -j to LINK_GUEST, the same value --link-journal=guest selects. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                Show this help\n"
               "     --version             Print version string\n"
               "  -D --directory=NAME      Root directory for the container\n"
               "  -b --boot                Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER           Run the command under specified user or uid\n"
               "  -C --controllers=LIST    Put the container in specified comma-separated\n"
               "                           cgroup hierarchies\n"
               "     --uuid=UUID           Set a specific machine UUID for the container\n"
               "  -M --machine=NAME        Set the machine name for the container\n"
               "     --private-network     Disable network in container\n"
               "     --read-only           Mount the root directory read-only\n"
               "     --capability=CAP      In addition to the default, retain specified\n"
               "                           capability\n"
               "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
               "  -j                       Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
               "                           the container\n"
               "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
               program_invocation_short_name);

        return 0;
}
139
140 static int parse_argv(int argc, char *argv[]) {
141
142         enum {
143                 ARG_VERSION = 0x100,
144                 ARG_PRIVATE_NETWORK,
145                 ARG_UUID,
146                 ARG_READ_ONLY,
147                 ARG_CAPABILITY,
148                 ARG_LINK_JOURNAL,
149                 ARG_BIND,
150                 ARG_BIND_RO
151         };
152
153         static const struct option options[] = {
154                 { "help",            no_argument,       NULL, 'h'                 },
155                 { "version",         no_argument,       NULL, ARG_VERSION         },
156                 { "directory",       required_argument, NULL, 'D'                 },
157                 { "user",            required_argument, NULL, 'u'                 },
158                 { "controllers",     required_argument, NULL, 'C'                 },
159                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
160                 { "boot",            no_argument,       NULL, 'b'                 },
161                 { "uuid",            required_argument, NULL, ARG_UUID            },
162                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
163                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
164                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
165                 { "bind",            required_argument, NULL, ARG_BIND            },
166                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
167                 { "machine",         required_argument, NULL, 'M'                 },
168                 { NULL,              0,                 NULL, 0                   }
169         };
170
171         int c;
172
173         assert(argc >= 0);
174         assert(argv);
175
176         while ((c = getopt_long(argc, argv, "+hD:u:C:bM:j", options, NULL)) >= 0) {
177
178                 switch (c) {
179
180                 case 'h':
181                         help();
182                         return 0;
183
184                 case ARG_VERSION:
185                         puts(PACKAGE_STRING);
186                         puts(SYSTEMD_FEATURES);
187                         return 0;
188
189                 case 'D':
190                         free(arg_directory);
191                         arg_directory = canonicalize_file_name(optarg);
192                         if (!arg_directory) {
193                                 log_error("Failed to canonicalize root directory.");
194                                 return -ENOMEM;
195                         }
196
197                         break;
198
199                 case 'u':
200                         free(arg_user);
201                         arg_user = strdup(optarg);
202                         if (!arg_user)
203                                 return log_oom();
204
205                         break;
206
207                 case 'C':
208                         strv_free(arg_controllers);
209                         arg_controllers = strv_split(optarg, ",");
210                         if (!arg_controllers)
211                                 return log_oom();
212
213                         cg_shorten_controllers(arg_controllers);
214                         break;
215
216                 case ARG_PRIVATE_NETWORK:
217                         arg_private_network = true;
218                         break;
219
220                 case 'b':
221                         arg_boot = true;
222                         break;
223
224                 case ARG_UUID:
225                         if (!id128_is_valid(optarg)) {
226                                 log_error("Invalid UUID: %s", optarg);
227                                 return -EINVAL;
228                         }
229
230                         arg_uuid = optarg;
231                         break;
232
233                 case 'M':
234                         if (!hostname_is_valid(optarg)) {
235                                 log_error("Invalid machine name: %s", optarg);
236                                 return -EINVAL;
237                         }
238
239                         free(arg_machine);
240                         arg_machine = strdup(optarg);
241                         if (!arg_machine)
242                                 return log_oom();
243
244                         break;
245
246                 case ARG_READ_ONLY:
247                         arg_read_only = true;
248                         break;
249
250                 case ARG_CAPABILITY: {
251                         char *state, *word;
252                         size_t length;
253
254                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
255                                 cap_value_t cap;
256                                 char *t;
257
258                                 t = strndup(word, length);
259                                 if (!t)
260                                         return log_oom();
261
262                                 if (cap_from_name(t, &cap) < 0) {
263                                         log_error("Failed to parse capability %s.", t);
264                                         free(t);
265                                         return -EINVAL;
266                                 }
267
268                                 free(t);
269                                 arg_retain |= 1ULL << (uint64_t) cap;
270                         }
271
272                         break;
273                 }
274
275                 case 'j':
276                         arg_link_journal = LINK_GUEST;
277                         break;
278
279                 case ARG_LINK_JOURNAL:
280                         if (streq(optarg, "auto"))
281                                 arg_link_journal = LINK_AUTO;
282                         else if (streq(optarg, "no"))
283                                 arg_link_journal = LINK_NO;
284                         else if (streq(optarg, "guest"))
285                                 arg_link_journal = LINK_GUEST;
286                         else if (streq(optarg, "host"))
287                                 arg_link_journal = LINK_HOST;
288                         else {
289                                 log_error("Failed to parse link journal mode %s", optarg);
290                                 return -EINVAL;
291                         }
292
293                         break;
294
295                 case ARG_BIND:
296                 case ARG_BIND_RO: {
297                         _cleanup_free_ char *a = NULL, *b = NULL;
298                         char *e;
299                         char ***x;
300                         int r;
301
302                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
303
304                         e = strchr(optarg, ':');
305                         if (e) {
306                                 a = strndup(optarg, e - optarg);
307                                 b = strdup(e + 1);
308                         } else {
309                                 a = strdup(optarg);
310                                 b = strdup(optarg);
311                         }
312
313                         if (!a || !b)
314                                 return log_oom();
315
316                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
317                                 log_error("Invalid bind mount specification: %s", optarg);
318                                 return -EINVAL;
319                         }
320
321                         r = strv_extend(x, a);
322                         if (r < 0)
323                                 return r;
324
325                         r = strv_extend(x, b);
326                         if (r < 0)
327                                 return r;
328
329                         break;
330                 }
331
332                 case '?':
333                         return -EINVAL;
334
335                 default:
336                         log_error("Unknown option code %c", c);
337                         return -EINVAL;
338                 }
339         }
340
341         return 1;
342 }
343
344 static int mount_all(const char *dest) {
345
346         typedef struct MountPoint {
347                 const char *what;
348                 const char *where;
349                 const char *type;
350                 const char *options;
351                 unsigned long flags;
352                 bool fatal;
353         } MountPoint;
354
355         static const MountPoint mount_table[] = {
356                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
357                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
358                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
359                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
360                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
361                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
362                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
363                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
364 #ifdef HAVE_SELINUX
365                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
366                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
367 #endif
368         };
369
370         unsigned k;
371         int r = 0;
372
373         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
374                 _cleanup_free_ char *where = NULL;
375                 int t;
376
377                 where = strjoin(dest, "/", mount_table[k].where, NULL);
378                 if (!where)
379                         return log_oom();
380
381                 t = path_is_mount_point(where, true);
382                 if (t < 0) {
383                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
384
385                         if (r == 0)
386                                 r = t;
387
388                         continue;
389                 }
390
391                 /* Skip this entry if it is not a remount. */
392                 if (mount_table[k].what && t > 0)
393                         continue;
394
395                 mkdir_p(where, 0755);
396
397                 if (mount(mount_table[k].what,
398                           where,
399                           mount_table[k].type,
400                           mount_table[k].flags,
401                           mount_table[k].options) < 0 &&
402                     mount_table[k].fatal) {
403
404                         log_error("mount(%s) failed: %m", where);
405
406                         if (r == 0)
407                                 r = -errno;
408                 }
409         }
410
411         return r;
412 }
413
414 static int mount_binds(const char *dest, char **l, unsigned long flags) {
415         char **x, **y;
416
417         STRV_FOREACH_PAIR(x, y, l) {
418                 _cleanup_free_ char *where = NULL;
419
420                 where = strjoin(dest, "/", *y, NULL);
421                 if (!where)
422                         return log_oom();
423
424                 mkdir_p_label(where, 0755);
425
426                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
427                         log_error("mount(%s) failed: %m", where);
428                         return -errno;
429                 }
430
431                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
432                         log_error("mount(%s) failed: %m", where);
433                         return -errno;
434                 }
435         }
436
437         return 0;
438 }
439
440 static int setup_timezone(const char *dest) {
441         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
442         char *z, *y;
443         int r;
444
445         assert(dest);
446
447         /* Fix the timezone, if possible */
448         r = readlink_malloc("/etc/localtime", &p);
449         if (r < 0) {
450                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
451                 return 0;
452         }
453
454         z = path_startswith(p, "../usr/share/zoneinfo/");
455         if (!z)
456                 z = path_startswith(p, "/usr/share/zoneinfo/");
457         if (!z) {
458                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
459                 return 0;
460         }
461
462         where = strappend(dest, "/etc/localtime");
463         if (!where)
464                 return log_oom();
465
466         r = readlink_malloc(where, &q);
467         if (r >= 0) {
468                 y = path_startswith(q, "../usr/share/zoneinfo/");
469                 if (!y)
470                         y = path_startswith(q, "/usr/share/zoneinfo/");
471
472
473                 /* Already pointing to the right place? Then do nothing .. */
474                 if (y && streq(y, z))
475                         return 0;
476         }
477
478         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
479         if (!check)
480                 return log_oom();
481
482         if (access(check, F_OK) < 0) {
483                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
484                 return 0;
485         }
486
487         what = strappend("../usr/share/zoneinfo/", z);
488         if (!what)
489                 return log_oom();
490
491         unlink(where);
492         if (symlink(what, where) < 0) {
493                 log_error("Failed to correct timezone of container: %m");
494                 return 0;
495         }
496
497         return 0;
498 }
499
500 static int setup_resolv_conf(const char *dest) {
501         char _cleanup_free_ *where = NULL;
502         _cleanup_close_ int fd = -1;
503
504         assert(dest);
505
506         if (arg_private_network)
507                 return 0;
508
509         /* Fix resolv.conf, if possible */
510         where = strappend(dest, "/etc/resolv.conf");
511         if (!where)
512                 return log_oom();
513
514         fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
515
516         /* We don't really care for the results of this really. If it
517          * fails, it fails, but meh... */
518         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
519                 log_warning("Failed to bind mount /etc/resolv.conf: %m");
520         else
521                 if (mount("/etc/resolv.conf", where, "bind",
522                           MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
523                         log_error("Failed to remount /etc/resolv.conf readonly: %m");
524                         return -errno;
525                 }
526
527         return 0;
528 }
529
530 static int setup_boot_id(const char *dest) {
531         _cleanup_free_ char *from = NULL, *to = NULL;
532         sd_id128_t rnd;
533         char as_uuid[37];
534         int r;
535
536         assert(dest);
537
538         /* Generate a new randomized boot ID, so that each boot-up of
539          * the container gets a new one */
540
541         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
542         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
543         if (!from || !to)
544                 return log_oom();
545
546         r = sd_id128_randomize(&rnd);
547         if (r < 0) {
548                 log_error("Failed to generate random boot id: %s", strerror(-r));
549                 return r;
550         }
551
552         snprintf(as_uuid, sizeof(as_uuid),
553                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
554                  SD_ID128_FORMAT_VAL(rnd));
555         char_array_0(as_uuid);
556
557         r = write_string_file(from, as_uuid);
558         if (r < 0) {
559                 log_error("Failed to write boot id: %s", strerror(-r));
560                 return r;
561         }
562
563         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
564                 log_error("Failed to bind mount boot id: %m");
565                 r = -errno;
566         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
567                 log_warning("Failed to make boot id read-only: %m");
568
569         unlink(from);
570         return r;
571 }
572
573 static int copy_devnodes(const char *dest) {
574
575         static const char devnodes[] =
576                 "null\0"
577                 "zero\0"
578                 "full\0"
579                 "random\0"
580                 "urandom\0"
581                 "tty\0";
582
583         const char *d;
584         int r = 0;
585         _cleanup_umask_ mode_t u;
586
587         assert(dest);
588
589         u = umask(0000);
590
591         NULSTR_FOREACH(d, devnodes) {
592                 struct stat st;
593                 _cleanup_free_ char *from = NULL, *to = NULL;
594
595                 asprintf(&from, "/dev/%s", d);
596                 asprintf(&to, "%s/dev/%s", dest, d);
597
598                 if (!from || !to) {
599                         log_oom();
600
601                         if (r == 0)
602                                 r = -ENOMEM;
603
604                         break;
605                 }
606
607                 if (stat(from, &st) < 0) {
608
609                         if (errno != ENOENT) {
610                                 log_error("Failed to stat %s: %m", from);
611                                 if (r == 0)
612                                         r = -errno;
613                         }
614
615                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
616
617                         log_error("%s is not a char or block device, cannot copy", from);
618                         if (r == 0)
619                                 r = -EIO;
620
621                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
622
623                         log_error("mknod(%s) failed: %m", dest);
624                         if (r == 0)
625                                 r = -errno;
626                 }
627         }
628
629         return r;
630 }
631
632 static int setup_ptmx(const char *dest) {
633         _cleanup_free_ char *p = NULL;
634
635         p = strappend(dest, "/dev/ptmx");
636         if (!p)
637                 return log_oom();
638
639         if (symlink("pts/ptmx", p) < 0) {
640                 log_error("Failed to create /dev/ptmx symlink: %m");
641                 return -errno;
642         }
643
644         return 0;
645 }
646
647 static int setup_dev_console(const char *dest, const char *console) {
648         struct stat st;
649         _cleanup_free_ char *to = NULL;
650         int r;
651         _cleanup_umask_ mode_t u;
652
653         assert(dest);
654         assert(console);
655
656         u = umask(0000);
657
658         if (stat(console, &st) < 0) {
659                 log_error("Failed to stat %s: %m", console);
660                 return -errno;
661
662         } else if (!S_ISCHR(st.st_mode)) {
663                 log_error("/dev/console is not a char device");
664                 return -EIO;
665         }
666
667         r = chmod_and_chown(console, 0600, 0, 0);
668         if (r < 0) {
669                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
670                 return r;
671         }
672
673         if (asprintf(&to, "%s/dev/console", dest) < 0)
674                 return log_oom();
675
676         /* We need to bind mount the right tty to /dev/console since
677          * ptys can only exist on pts file systems. To have something
678          * to bind mount things on we create a device node first, that
679          * has the right major/minor (note that the major minor
680          * doesn't actually matter here, since we mount it over
681          * anyway). */
682
683         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
684                 log_error("mknod() for /dev/console failed: %m");
685                 return -errno;
686         }
687
688         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
689                 log_error("Bind mount for /dev/console failed: %m");
690                 return -errno;
691         }
692
693         return 0;
694 }
695
696 static int setup_kmsg(const char *dest, int kmsg_socket) {
697         _cleanup_free_ char *from = NULL, *to = NULL;
698         int r, fd, k;
699         _cleanup_umask_ mode_t u;
700         union {
701                 struct cmsghdr cmsghdr;
702                 uint8_t buf[CMSG_SPACE(sizeof(int))];
703         } control = {};
704         struct msghdr mh = {
705                 .msg_control = &control,
706                 .msg_controllen = sizeof(control),
707         };
708         struct cmsghdr *cmsg;
709
710         assert(dest);
711         assert(kmsg_socket >= 0);
712
713         u = umask(0000);
714
715         /* We create the kmsg FIFO as /dev/kmsg, but immediately
716          * delete it after bind mounting it to /proc/kmsg. While FIFOs
717          * on the reading side behave very similar to /proc/kmsg,
718          * their writing side behaves differently from /dev/kmsg in
719          * that writing blocks when nothing is reading. In order to
720          * avoid any problems with containers deadlocking due to this
721          * we simply make /dev/kmsg unavailable to the container. */
722         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
723             asprintf(&to, "%s/proc/kmsg", dest) < 0)
724                 return log_oom();
725
726         if (mkfifo(from, 0600) < 0) {
727                 log_error("mkfifo() for /dev/kmsg failed: %m");
728                 return -errno;
729         }
730
731         r = chmod_and_chown(from, 0600, 0, 0);
732         if (r < 0) {
733                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
734                 return r;
735         }
736
737         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
738                 log_error("Bind mount for /proc/kmsg failed: %m");
739                 return -errno;
740         }
741
742         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
743         if (fd < 0) {
744                 log_error("Failed to open fifo: %m");
745                 return -errno;
746         }
747
748         cmsg = CMSG_FIRSTHDR(&mh);
749         cmsg->cmsg_level = SOL_SOCKET;
750         cmsg->cmsg_type = SCM_RIGHTS;
751         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
752         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
753
754         mh.msg_controllen = cmsg->cmsg_len;
755
756         /* Store away the fd in the socket, so that it stays open as
757          * long as we run the child */
758         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
759         close_nointr_nofail(fd);
760
761         if (k < 0) {
762                 log_error("Failed to send FIFO fd: %m");
763                 return -errno;
764         }
765
766         /* And now make the FIFO unavailable as /dev/kmsg... */
767         unlink(from);
768         return 0;
769 }
770
771 static int setup_hostname(void) {
772
773         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
774                 return -errno;
775
776         return 0;
777 }
778
779 static int setup_journal(const char *directory) {
780         sd_id128_t machine_id;
781         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
782         char *id;
783         int r;
784
785         if (arg_link_journal == LINK_NO)
786                 return 0;
787
788         p = strappend(directory, "/etc/machine-id");
789         if (!p)
790                 return log_oom();
791
792         r = read_one_line_file(p, &b);
793         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
794                 return 0;
795         else if (r < 0) {
796                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
797                 return r;
798         }
799
800         id = strstrip(b);
801         if (isempty(id) && arg_link_journal == LINK_AUTO)
802                 return 0;
803
804         /* Verify validity */
805         r = sd_id128_from_string(id, &machine_id);
806         if (r < 0) {
807                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
808                 return r;
809         }
810
811         free(p);
812         p = strappend("/var/log/journal/", id);
813         q = strjoin(directory, "/var/log/journal/", id, NULL);
814         if (!p || !q)
815                 return log_oom();
816
817         if (path_is_mount_point(p, false) > 0) {
818                 if (arg_link_journal != LINK_AUTO) {
819                         log_error("%s: already a mount point, refusing to use for journal", p);
820                         return -EEXIST;
821                 }
822
823                 return 0;
824         }
825
826         if (path_is_mount_point(q, false) > 0) {
827                 if (arg_link_journal != LINK_AUTO) {
828                         log_error("%s: already a mount point, refusing to use for journal", q);
829                         return -EEXIST;
830                 }
831
832                 return 0;
833         }
834
835         r = readlink_and_make_absolute(p, &d);
836         if (r >= 0) {
837                 if ((arg_link_journal == LINK_GUEST ||
838                      arg_link_journal == LINK_AUTO) &&
839                     path_equal(d, q)) {
840
841                         r = mkdir_p(q, 0755);
842                         if (r < 0)
843                                 log_warning("failed to create directory %s: %m", q);
844                         return 0;
845                 }
846
847                 if (unlink(p) < 0) {
848                         log_error("Failed to remove symlink %s: %m", p);
849                         return -errno;
850                 }
851         } else if (r == -EINVAL) {
852
853                 if (arg_link_journal == LINK_GUEST &&
854                     rmdir(p) < 0) {
855
856                         if (errno == ENOTDIR) {
857                                 log_error("%s already exists and is neither a symlink nor a directory", p);
858                                 return r;
859                         } else {
860                                 log_error("Failed to remove %s: %m", p);
861                                 return -errno;
862                         }
863                 }
864         } else if (r != -ENOENT) {
865                 log_error("readlink(%s) failed: %m", p);
866                 return r;
867         }
868
869         if (arg_link_journal == LINK_GUEST) {
870
871                 if (symlink(q, p) < 0) {
872                         log_error("Failed to symlink %s to %s: %m", q, p);
873                         return -errno;
874                 }
875
876                 r = mkdir_p(q, 0755);
877                 if (r < 0)
878                         log_warning("failed to create directory %s: %m", q);
879                 return 0;
880         }
881
882         if (arg_link_journal == LINK_HOST) {
883                 r = mkdir_p(p, 0755);
884                 if (r < 0) {
885                         log_error("Failed to create %s: %m", p);
886                         return r;
887                 }
888
889         } else if (access(p, F_OK) < 0)
890                 return 0;
891
892         if (dir_is_empty(q) == 0) {
893                 log_error("%s not empty.", q);
894                 return -ENOTEMPTY;
895         }
896
897         r = mkdir_p(q, 0755);
898         if (r < 0) {
899                 log_error("Failed to create %s: %m", q);
900                 return r;
901         }
902
903         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
904                 log_error("Failed to bind mount journal from host into guest: %m");
905                 return -errno;
906         }
907
908         return 0;
909 }
910
911 static int setup_cgroup(const char *path) {
912         char **c;
913         int r;
914
915         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
916         if (r < 0) {
917                 log_error("Failed to create cgroup: %s", strerror(-r));
918                 return r;
919         }
920
921         STRV_FOREACH(c, arg_controllers) {
922                 r = cg_create_and_attach(*c, path, 1);
923                 if (r < 0)
924                         log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
925         }
926
927         return 0;
928 }
929
930 static int save_attributes(const char *cgroup, pid_t pid, const char *uuid, const char *directory) {
931         _cleanup_free_ char *path = NULL;
932         char buf[DECIMAL_STR_MAX(pid_t)];
933         int r = 0, k;
934
935         assert(cgroup);
936         assert(pid >= 0);
937         assert(arg_directory);
938
939 #ifdef HAVE_XATTR
940         assert_se(snprintf(buf, sizeof(buf), "%lu", (unsigned long) pid) < (int) sizeof(buf));
941
942         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup, NULL, &path);
943         if (r < 0) {
944                 log_error("Failed to get path: %s", strerror(-r));
945                 return r;
946         }
947
948         r = setxattr(path, "trusted.init_pid", buf, strlen(buf), XATTR_CREATE);
949         if (r < 0)
950                 log_warning("Failed to set %s attribute on %s: %m", "trusted.init_pid", path);
951
952         if (uuid) {
953                 k = setxattr(path, "trusted.machine_id", uuid, strlen(uuid), XATTR_CREATE);
954                 if (k < 0) {
955                         log_warning("Failed to set %s attribute on %s: %m", "trusted.machine_id", path);
956                         if (r == 0)
957                                 r = k;
958                 }
959         }
960
961         k = setxattr(path, "trusted.root_directory", directory, strlen(directory), XATTR_CREATE);
962         if (k < 0) {
963                 log_warning("Failed to set %s attribute on %s: %m", "trusted.root_directory", path);
964                 if (r == 0)
965                         r = k;
966         }
967 #endif
968         return r;
969 }
970
971 static int drop_capabilities(void) {
972         return capability_bounding_set_drop(~arg_retain, false);
973 }
974
975 static int process_pty(int master, pid_t pid, sigset_t *mask) {
976
977         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
978         size_t in_buffer_full = 0, out_buffer_full = 0;
979         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
980         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
981         int ep = -1, signal_fd = -1, r;
982         bool tried_orderly_shutdown = false;
983
984         assert(master >= 0);
985         assert(pid > 0);
986         assert(mask);
987
988         fd_nonblock(STDIN_FILENO, 1);
989         fd_nonblock(STDOUT_FILENO, 1);
990         fd_nonblock(master, 1);
991
992         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
993         if (signal_fd < 0) {
994                 log_error("signalfd(): %m");
995                 r = -errno;
996                 goto finish;
997         }
998
999         ep = epoll_create1(EPOLL_CLOEXEC);
1000         if (ep < 0) {
1001                 log_error("Failed to create epoll: %m");
1002                 r = -errno;
1003                 goto finish;
1004         }
1005
1006         /* We read from STDIN only if this is actually a TTY,
1007          * otherwise we assume non-interactivity. */
1008         if (isatty(STDIN_FILENO)) {
1009                 zero(stdin_ev);
1010                 stdin_ev.events = EPOLLIN|EPOLLET;
1011                 stdin_ev.data.fd = STDIN_FILENO;
1012
1013                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
1014                         log_error("Failed to register STDIN in epoll: %m");
1015                         r = -errno;
1016                         goto finish;
1017                 }
1018         }
1019
1020         zero(stdout_ev);
1021         stdout_ev.events = EPOLLOUT|EPOLLET;
1022         stdout_ev.data.fd = STDOUT_FILENO;
1023
1024         zero(master_ev);
1025         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
1026         master_ev.data.fd = master;
1027
1028         zero(signal_ev);
1029         signal_ev.events = EPOLLIN;
1030         signal_ev.data.fd = signal_fd;
1031
1032         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
1033                 if (errno != EPERM) {
1034                         log_error("Failed to register stdout in epoll: %m");
1035                         r = -errno;
1036                         goto finish;
1037                 }
1038                 /* stdout without epoll support. Likely redirected to regular file. */
1039                 stdout_writable = true;
1040         }
1041
1042         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1043             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1044                 log_error("Failed to register fds in epoll: %m");
1045                 r = -errno;
1046                 goto finish;
1047         }
1048
1049         for (;;) {
1050                 struct epoll_event ev[16];
1051                 ssize_t k;
1052                 int i, nfds;
1053
1054                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1055                 if (nfds < 0) {
1056
1057                         if (errno == EINTR || errno == EAGAIN)
1058                                 continue;
1059
1060                         log_error("epoll_wait(): %m");
1061                         r = -errno;
1062                         goto finish;
1063                 }
1064
1065                 assert(nfds >= 1);
1066
1067                 for (i = 0; i < nfds; i++) {
1068                         if (ev[i].data.fd == STDIN_FILENO) {
1069
1070                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1071                                         stdin_readable = true;
1072
1073                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1074
1075                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1076                                         stdout_writable = true;
1077
1078                         } else if (ev[i].data.fd == master) {
1079
1080                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1081                                         master_readable = true;
1082
1083                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1084                                         master_writable = true;
1085
1086                         } else if (ev[i].data.fd == signal_fd) {
1087                                 struct signalfd_siginfo sfsi;
1088                                 ssize_t n;
1089
1090                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1091                                 if (n != sizeof(sfsi)) {
1092
1093                                         if (n >= 0) {
1094                                                 log_error("Failed to read from signalfd: invalid block size");
1095                                                 r = -EIO;
1096                                                 goto finish;
1097                                         }
1098
1099                                         if (errno != EINTR && errno != EAGAIN) {
1100                                                 log_error("Failed to read from signalfd: %m");
1101                                                 r = -errno;
1102                                                 goto finish;
1103                                         }
1104                                 } else {
1105
1106                                         if (sfsi.ssi_signo == SIGWINCH) {
1107                                                 struct winsize ws;
1108
1109                                                 /* The window size changed, let's forward that. */
1110                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1111                                                         ioctl(master, TIOCSWINSZ, &ws);
1112                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1113
1114                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1115
1116                                                 /* This only works for systemd... */
1117                                                 tried_orderly_shutdown = true;
1118                                                 kill(pid, SIGRTMIN+3);
1119
1120                                         } else {
1121                                                 r = 0;
1122                                                 goto finish;
1123                                         }
1124                                 }
1125                         }
1126                 }
1127
1128                 while ((stdin_readable && in_buffer_full <= 0) ||
1129                        (master_writable && in_buffer_full > 0) ||
1130                        (master_readable && out_buffer_full <= 0) ||
1131                        (stdout_writable && out_buffer_full > 0)) {
1132
1133                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1134
1135                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1136                                 if (k < 0) {
1137
1138                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1139                                                 stdin_readable = false;
1140                                         else {
1141                                                 log_error("read(): %m");
1142                                                 r = -errno;
1143                                                 goto finish;
1144                                         }
1145                                 } else
1146                                         in_buffer_full += (size_t) k;
1147                         }
1148
1149                         if (master_writable && in_buffer_full > 0) {
1150
1151                                 k = write(master, in_buffer, in_buffer_full);
1152                                 if (k < 0) {
1153
1154                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1155                                                 master_writable = false;
1156                                         else {
1157                                                 log_error("write(): %m");
1158                                                 r = -errno;
1159                                                 goto finish;
1160                                         }
1161
1162                                 } else {
1163                                         assert(in_buffer_full >= (size_t) k);
1164                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1165                                         in_buffer_full -= k;
1166                                 }
1167                         }
1168
1169                         if (master_readable && out_buffer_full < LINE_MAX) {
1170
1171                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1172                                 if (k < 0) {
1173
1174                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1175                                                 master_readable = false;
1176                                         else {
1177                                                 log_error("read(): %m");
1178                                                 r = -errno;
1179                                                 goto finish;
1180                                         }
1181                                 }  else
1182                                         out_buffer_full += (size_t) k;
1183                         }
1184
1185                         if (stdout_writable && out_buffer_full > 0) {
1186
1187                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1188                                 if (k < 0) {
1189
1190                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1191                                                 stdout_writable = false;
1192                                         else {
1193                                                 log_error("write(): %m");
1194                                                 r = -errno;
1195                                                 goto finish;
1196                                         }
1197
1198                                 } else {
1199                                         assert(out_buffer_full >= (size_t) k);
1200                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1201                                         out_buffer_full -= k;
1202                                 }
1203                         }
1204                 }
1205         }
1206
1207 finish:
1208         if (ep >= 0)
1209                 close_nointr_nofail(ep);
1210
1211         if (signal_fd >= 0)
1212                 close_nointr_nofail(signal_fd);
1213
1214         return r;
1215 }
1216
1217 int main(int argc, char *argv[]) {
1218         pid_t pid = 0;
1219         int r = EXIT_FAILURE, k;
1220         _cleanup_free_ char *newcg = NULL;
1221         _cleanup_close_ int master = -1;
1222         int n_fd_passed;
1223         const char *console = NULL;
1224         struct termios saved_attr, raw_attr;
1225         sigset_t mask;
1226         bool saved_attr_valid = false;
1227         struct winsize ws;
1228         int kmsg_socket_pair[2] = { -1, -1 };
1229         FDSet *fds = NULL;
1230
1231         log_parse_environment();
1232         log_open();
1233
1234         k = parse_argv(argc, argv);
1235         if (k < 0)
1236                 goto finish;
1237         else if (k == 0) {
1238                 r = EXIT_SUCCESS;
1239                 goto finish;
1240         }
1241
1242         if (arg_directory) {
1243                 char *p;
1244
1245                 p = path_make_absolute_cwd(arg_directory);
1246                 free(arg_directory);
1247                 arg_directory = p;
1248         } else
1249                 arg_directory = get_current_dir_name();
1250
1251         if (!arg_directory) {
1252                 log_error("Failed to determine path, please use -D.");
1253                 goto finish;
1254         }
1255
1256         path_kill_slashes(arg_directory);
1257
1258         if (!arg_machine) {
1259                 arg_machine = strdup(path_get_file_name(arg_directory));
1260                 if (!arg_machine) {
1261                         log_oom();
1262                         goto finish;
1263                 }
1264
1265                 hostname_cleanup(arg_machine);
1266                 if (isempty(arg_machine)) {
1267                         log_error("Failed to determine machine name automatically, please use -M.");
1268                         goto finish;
1269                 }
1270         }
1271
1272         if (geteuid() != 0) {
1273                 log_error("Need to be root.");
1274                 goto finish;
1275         }
1276
1277         if (sd_booted() <= 0) {
1278                 log_error("Not running on a systemd system.");
1279                 goto finish;
1280         }
1281
1282         if (path_equal(arg_directory, "/")) {
1283                 log_error("Spawning container on root directory not supported.");
1284                 goto finish;
1285         }
1286
1287         if (path_is_os_tree(arg_directory) <= 0) {
1288                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1289                 goto finish;
1290         }
1291
1292         log_close();
1293         n_fd_passed = sd_listen_fds(false);
1294         if (n_fd_passed > 0) {
1295                 k = fdset_new_listen_fds(&fds, false);
1296                 if (k < 0) {
1297                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1298                         goto finish;
1299                 }
1300         }
1301         fdset_close_others(fds);
1302         log_open();
1303
1304         k = cg_get_machine_path(arg_machine, &newcg);
1305         if (k < 0) {
1306                 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1307                 goto finish;
1308         }
1309
1310         k = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1311         if (k <= 0 && k != -ENOENT) {
1312                 log_error("Container already running.");
1313
1314                 free(newcg);
1315                 newcg = NULL;
1316
1317                 goto finish;
1318         }
1319
1320         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1321         if (master < 0) {
1322                 log_error("Failed to acquire pseudo tty: %m");
1323                 goto finish;
1324         }
1325
1326         console = ptsname(master);
1327         if (!console) {
1328                 log_error("Failed to determine tty name: %m");
1329                 goto finish;
1330         }
1331
1332         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1333
1334         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1335                 ioctl(master, TIOCSWINSZ, &ws);
1336
1337         if (unlockpt(master) < 0) {
1338                 log_error("Failed to unlock tty: %m");
1339                 goto finish;
1340         }
1341
1342         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1343                 saved_attr_valid = true;
1344
1345                 raw_attr = saved_attr;
1346                 cfmakeraw(&raw_attr);
1347                 raw_attr.c_lflag &= ~ECHO;
1348         }
1349
1350         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1351                 log_error("Failed to create kmsg socket pair.");
1352                 goto finish;
1353         }
1354
1355         sd_notify(0, "READY=1");
1356
1357         assert_se(sigemptyset(&mask) == 0);
1358         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1359         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1360
1361         for (;;) {
1362                 siginfo_t status;
1363                 int pipefd[2], pipefd2[2];
1364
1365                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1366                         log_error("pipe2(): %m");
1367                         goto finish;
1368                 }
1369
1370                 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1371                         log_error("pipe2(): %m");
1372                         close_pipe(pipefd);
1373                         goto finish;
1374                 }
1375
1376                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1377                 if (pid < 0) {
1378                         if (errno == EINVAL)
1379                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1380                         else
1381                                 log_error("clone() failed: %m");
1382
1383                         goto finish;
1384                 }
1385
1386                 if (pid == 0) {
1387                         /* child */
1388                         const char *home = NULL;
1389                         uid_t uid = (uid_t) -1;
1390                         gid_t gid = (gid_t) -1;
1391                         unsigned n_env = 2;
1392                         const char *envp[] = {
1393                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1394                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1395                                 NULL, /* TERM */
1396                                 NULL, /* HOME */
1397                                 NULL, /* USER */
1398                                 NULL, /* LOGNAME */
1399                                 NULL, /* container_uuid */
1400                                 NULL, /* LISTEN_FDS */
1401                                 NULL, /* LISTEN_PID */
1402                                 NULL
1403                         };
1404
1405                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1406                         if (envp[n_env])
1407                                 n_env ++;
1408
1409                         /* Wait for the parent process to log our PID */
1410                         close_nointr_nofail(pipefd[1]);
1411                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1412                         close_nointr_nofail(pipefd[0]);
1413
1414                         close_nointr_nofail(master);
1415                         master = -1;
1416
1417                         if (saved_attr_valid) {
1418                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1419                                         log_error("Failed to set terminal attributes: %m");
1420                                         goto child_fail;
1421                                 }
1422                         }
1423
1424                         close_nointr(STDIN_FILENO);
1425                         close_nointr(STDOUT_FILENO);
1426                         close_nointr(STDERR_FILENO);
1427
1428                         close_nointr_nofail(kmsg_socket_pair[0]);
1429                         kmsg_socket_pair[0] = -1;
1430
1431                         reset_all_signal_handlers();
1432
1433                         assert_se(sigemptyset(&mask) == 0);
1434                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1435
1436                         k = open_terminal(console, O_RDWR);
1437                         if (k != STDIN_FILENO) {
1438                                 if (k >= 0) {
1439                                         close_nointr_nofail(k);
1440                                         k = -EINVAL;
1441                                 }
1442
1443                                 log_error("Failed to open console: %s", strerror(-k));
1444                                 goto child_fail;
1445                         }
1446
1447                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1448                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1449                                 log_error("Failed to duplicate console: %m");
1450                                 goto child_fail;
1451                         }
1452
1453                         if (setsid() < 0) {
1454                                 log_error("setsid() failed: %m");
1455                                 goto child_fail;
1456                         }
1457
1458                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1459                                 log_error("PR_SET_PDEATHSIG failed: %m");
1460                                 goto child_fail;
1461                         }
1462
1463                         if (setup_cgroup(newcg) < 0)
1464                                 goto child_fail;
1465
1466                         close_pipe(pipefd2);
1467
1468                         /* Mark everything as slave, so that we still
1469                          * receive mounts from the real root, but don't
1470                          * propagate mounts to the real root. */
1471                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1472                                 log_error("MS_SLAVE|MS_REC failed: %m");
1473                                 goto child_fail;
1474                         }
1475
1476                         /* Turn directory into bind mount */
1477                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1478                                 log_error("Failed to make bind mount.");
1479                                 goto child_fail;
1480                         }
1481
1482                         if (arg_read_only)
1483                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1484                                         log_error("Failed to make read-only.");
1485                                         goto child_fail;
1486                                 }
1487
1488                         if (mount_all(arg_directory) < 0)
1489                                 goto child_fail;
1490
1491                         if (copy_devnodes(arg_directory) < 0)
1492                                 goto child_fail;
1493
1494                         if (setup_ptmx(arg_directory) < 0)
1495                                 goto child_fail;
1496
1497                         dev_setup(arg_directory);
1498
1499                         if (setup_dev_console(arg_directory, console) < 0)
1500                                 goto child_fail;
1501
1502                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1503                                 goto child_fail;
1504
1505                         close_nointr_nofail(kmsg_socket_pair[1]);
1506                         kmsg_socket_pair[1] = -1;
1507
1508                         if (setup_boot_id(arg_directory) < 0)
1509                                 goto child_fail;
1510
1511                         if (setup_timezone(arg_directory) < 0)
1512                                 goto child_fail;
1513
1514                         if (setup_resolv_conf(arg_directory) < 0)
1515                                 goto child_fail;
1516
1517                         if (setup_journal(arg_directory) < 0)
1518                                 goto child_fail;
1519
1520                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1521                                 goto child_fail;
1522
1523                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1524                                 goto child_fail;
1525
1526                         if (chdir(arg_directory) < 0) {
1527                                 log_error("chdir(%s) failed: %m", arg_directory);
1528                                 goto child_fail;
1529                         }
1530
1531                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1532                                 log_error("mount(MS_MOVE) failed: %m");
1533                                 goto child_fail;
1534                         }
1535
1536                         if (chroot(".") < 0) {
1537                                 log_error("chroot() failed: %m");
1538                                 goto child_fail;
1539                         }
1540
1541                         if (chdir("/") < 0) {
1542                                 log_error("chdir() failed: %m");
1543                                 goto child_fail;
1544                         }
1545
1546                         umask(0022);
1547
1548                         loopback_setup();
1549
1550                         if (drop_capabilities() < 0) {
1551                                 log_error("drop_capabilities() failed: %m");
1552                                 goto child_fail;
1553                         }
1554
1555                         if (arg_user) {
1556
1557                                 /* Note that this resolves user names
1558                                  * inside the container, and hence
1559                                  * accesses the NSS modules from the
1560                                  * container and not the host. This is
1561                                  * a bit weird... */
1562
1563                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1564                                         log_error("get_user_creds() failed: %m");
1565                                         goto child_fail;
1566                                 }
1567
1568                                 if (mkdir_parents_label(home, 0775) < 0) {
1569                                         log_error("mkdir_parents_label() failed: %m");
1570                                         goto child_fail;
1571                                 }
1572
1573                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1574                                         log_error("mkdir_safe_label() failed: %m");
1575                                         goto child_fail;
1576                                 }
1577
1578                                 if (initgroups((const char*)arg_user, gid) < 0) {
1579                                         log_error("initgroups() failed: %m");
1580                                         goto child_fail;
1581                                 }
1582
1583                                 if (setresgid(gid, gid, gid) < 0) {
1584                                         log_error("setregid() failed: %m");
1585                                         goto child_fail;
1586                                 }
1587
1588                                 if (setresuid(uid, uid, uid) < 0) {
1589                                         log_error("setreuid() failed: %m");
1590                                         goto child_fail;
1591                                 }
1592                         } else {
1593                                 /* Reset everything fully to 0, just in case */
1594
1595                                 if (setgroups(0, NULL) < 0) {
1596                                         log_error("setgroups() failed: %m");
1597                                         goto child_fail;
1598                                 }
1599
1600                                 if (setresgid(0, 0, 0) < 0) {
1601                                         log_error("setregid() failed: %m");
1602                                         goto child_fail;
1603                                 }
1604
1605                                 if (setresuid(0, 0, 0) < 0) {
1606                                         log_error("setreuid() failed: %m");
1607                                         goto child_fail;
1608                                 }
1609                         }
1610
1611                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1612                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1613                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1614                                 log_oom();
1615                                 goto child_fail;
1616                         }
1617
1618                         if (arg_uuid) {
1619                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1620                                         log_oom();
1621                                         goto child_fail;
1622                                 }
1623                         }
1624
1625                         if (fdset_size(fds) > 0) {
1626                                 k = fdset_cloexec(fds, false);
1627                                 if (k < 0) {
1628                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1629                                         goto child_fail;
1630                                 }
1631
1632                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1633                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1634                                         log_oom();
1635                                         goto child_fail;
1636                                 }
1637                         }
1638
1639                         setup_hostname();
1640
1641                         if (arg_boot) {
1642                                 char **a;
1643                                 size_t l;
1644
1645                                 /* Automatically search for the init system */
1646
1647                                 l = 1 + argc - optind;
1648                                 a = newa(char*, l + 1);
1649                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1650
1651                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1652                                 execve(a[0], a, (char**) envp);
1653
1654                                 a[0] = (char*) "/lib/systemd/systemd";
1655                                 execve(a[0], a, (char**) envp);
1656
1657                                 a[0] = (char*) "/sbin/init";
1658                                 execve(a[0], a, (char**) envp);
1659                         } else if (argc > optind)
1660                                 execvpe(argv[optind], argv + optind, (char**) envp);
1661                         else {
1662                                 chdir(home ? home : "/root");
1663                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1664                         }
1665
1666                         log_error("execv() failed: %m");
1667
1668                 child_fail:
1669                         _exit(EXIT_FAILURE);
1670                 }
1671
1672                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1673                 close_nointr_nofail(pipefd[0]);
1674                 close_nointr_nofail(pipefd[1]);
1675
1676                 /* Wait for the child process to establish cgroup hierarchy */
1677                 close_nointr_nofail(pipefd2[1]);
1678                 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1679                 close_nointr_nofail(pipefd2[0]);
1680
1681                 save_attributes(newcg, pid, arg_uuid, arg_directory);
1682
1683                 fdset_free(fds);
1684                 fds = NULL;
1685
1686                 if (process_pty(master, pid, &mask) < 0)
1687                         goto finish;
1688
1689                 if (saved_attr_valid)
1690                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1691
1692                 k = wait_for_terminate(pid, &status);
1693                 if (k < 0) {
1694                         r = EXIT_FAILURE;
1695                         break;
1696                 }
1697
1698                 if (status.si_code == CLD_EXITED) {
1699                         r = status.si_status;
1700                         if (status.si_status != 0) {
1701                                 log_error("Container failed with error code %i.", status.si_status);
1702                                 break;
1703                         }
1704
1705                         log_debug("Container exited successfully.");
1706                         break;
1707                 } else if (status.si_code == CLD_KILLED &&
1708                            status.si_status == SIGINT) {
1709                         log_info("Container has been shut down.");
1710                         r = 0;
1711                         break;
1712                 } else if (status.si_code == CLD_KILLED &&
1713                            status.si_status == SIGHUP) {
1714                         log_info("Container is being rebooted.");
1715                         continue;
1716                 } else if (status.si_code == CLD_KILLED ||
1717                            status.si_code == CLD_DUMPED) {
1718
1719                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1720                         r = EXIT_FAILURE;
1721                         break;
1722                 } else {
1723                         log_error("Container failed due to unknown reason.");
1724                         r = EXIT_FAILURE;
1725                         break;
1726                 }
1727         }
1728
1729 finish:
1730         if (saved_attr_valid)
1731                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1732
1733         close_pipe(kmsg_socket_pair);
1734
1735         if (newcg)
1736                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1737
1738         free(arg_directory);
1739         free(arg_machine);
1740         strv_free(arg_controllers);
1741
1742         fdset_free(fds);
1743
1744         return r;
1745 }