chiark / gitweb /
nspawn: Include netlink headers rather than using #ifdef
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <linux/netlink.h>
43 #include <sys/un.h>
44 #include <sys/socket.h>
45
46 #ifdef HAVE_XATTR
47 #include <attr/xattr.h>
48 #endif
49
50 #include <systemd/sd-daemon.h>
51
52 #include "log.h"
53 #include "util.h"
54 #include "mkdir.h"
55 #include "macro.h"
56 #include "audit.h"
57 #include "missing.h"
58 #include "cgroup-util.h"
59 #include "strv.h"
60 #include "path-util.h"
61 #include "loopback-setup.h"
62 #include "sd-id128.h"
63 #include "dev-setup.h"
64 #include "fdset.h"
65 #include "build.h"
66 #include "fileio.h"
67
68 #ifndef TTY_GID
69 #define TTY_GID 5
70 #endif
71
/* Journal linking mode, as chosen with --link-journal=MODE (or -j) in
 * parse_argv(). */
typedef enum LinkJournal {
        LINK_NO    = 0,  /* --link-journal=no */
        LINK_AUTO  = 1,  /* --link-journal=auto; initial value of arg_link_journal */
        LINK_HOST  = 2,  /* --link-journal=host */
        LINK_GUEST = 3   /* --link-journal=guest, also selected by -j */
} LinkJournal;
78
79 static char *arg_directory = NULL;
80 static char *arg_user = NULL;
81 static char **arg_controllers = NULL;
82 static char *arg_uuid = NULL;
83 static char *arg_machine = NULL;
84 static bool arg_private_network = false;
85 static bool arg_read_only = false;
86 static bool arg_boot = false;
87 static LinkJournal arg_link_journal = LINK_AUTO;
88 static uint64_t arg_retain =
89         (1ULL << CAP_CHOWN) |
90         (1ULL << CAP_DAC_OVERRIDE) |
91         (1ULL << CAP_DAC_READ_SEARCH) |
92         (1ULL << CAP_FOWNER) |
93         (1ULL << CAP_FSETID) |
94         (1ULL << CAP_IPC_OWNER) |
95         (1ULL << CAP_KILL) |
96         (1ULL << CAP_LEASE) |
97         (1ULL << CAP_LINUX_IMMUTABLE) |
98         (1ULL << CAP_NET_BIND_SERVICE) |
99         (1ULL << CAP_NET_BROADCAST) |
100         (1ULL << CAP_NET_RAW) |
101         (1ULL << CAP_SETGID) |
102         (1ULL << CAP_SETFCAP) |
103         (1ULL << CAP_SETPCAP) |
104         (1ULL << CAP_SETUID) |
105         (1ULL << CAP_SYS_ADMIN) |
106         (1ULL << CAP_SYS_CHROOT) |
107         (1ULL << CAP_SYS_NICE) |
108         (1ULL << CAP_SYS_PTRACE) |
109         (1ULL << CAP_SYS_TTY_CONFIG) |
110         (1ULL << CAP_SYS_RESOURCE) |
111         (1ULL << CAP_SYS_BOOT) |
112         (1ULL << CAP_AUDIT_WRITE) |
113         (1ULL << CAP_AUDIT_CONTROL);
114 static char **arg_bind = NULL;
115 static char **arg_bind_ro = NULL;
116
117 static int help(void) {
118
119         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
120                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
121                "  -h --help                Show this help\n"
122                "     --version             Print version string\n"
123                "  -D --directory=NAME      Root directory for the container\n"
124                "  -b --boot                Boot up full system (i.e. invoke init)\n"
125                "  -u --user=USER           Run the command under specified user or uid\n"
126                "  -C --controllers=LIST    Put the container in specified comma-separated\n"
127                "                           cgroup hierarchies\n"
128                "     --uuid=UUID           Set a specific machine UUID for the container\n"
129                "  -M --machine=NAME        Set the machine name for the container\n"
130                "     --private-network     Disable network in container\n"
131                "     --read-only           Mount the root directory read-only\n"
132                "     --capability=CAP      In addition to the default, retain specified\n"
133                "                           capability\n"
134                "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
135                "  -j                       Equivalent to --link-journal=host\n"
136                "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
137                "                           the container\n"
138                "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
139                program_invocation_short_name);
140
141         return 0;
142 }
143
144 static int parse_argv(int argc, char *argv[]) {
145
146         enum {
147                 ARG_VERSION = 0x100,
148                 ARG_PRIVATE_NETWORK,
149                 ARG_UUID,
150                 ARG_READ_ONLY,
151                 ARG_CAPABILITY,
152                 ARG_LINK_JOURNAL,
153                 ARG_BIND,
154                 ARG_BIND_RO
155         };
156
157         static const struct option options[] = {
158                 { "help",            no_argument,       NULL, 'h'                 },
159                 { "version",         no_argument,       NULL, ARG_VERSION         },
160                 { "directory",       required_argument, NULL, 'D'                 },
161                 { "user",            required_argument, NULL, 'u'                 },
162                 { "controllers",     required_argument, NULL, 'C'                 },
163                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
164                 { "boot",            no_argument,       NULL, 'b'                 },
165                 { "uuid",            required_argument, NULL, ARG_UUID            },
166                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
167                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
168                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
169                 { "bind",            required_argument, NULL, ARG_BIND            },
170                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
171                 { "machine",         required_argument, NULL, 'M'                 },
172                 { NULL,              0,                 NULL, 0                   }
173         };
174
175         int c;
176
177         assert(argc >= 0);
178         assert(argv);
179
180         while ((c = getopt_long(argc, argv, "+hD:u:C:bM:j", options, NULL)) >= 0) {
181
182                 switch (c) {
183
184                 case 'h':
185                         help();
186                         return 0;
187
188                 case ARG_VERSION:
189                         puts(PACKAGE_STRING);
190                         puts(SYSTEMD_FEATURES);
191                         return 0;
192
193                 case 'D':
194                         free(arg_directory);
195                         arg_directory = canonicalize_file_name(optarg);
196                         if (!arg_directory) {
197                                 log_error("Failed to canonicalize root directory.");
198                                 return -ENOMEM;
199                         }
200
201                         break;
202
203                 case 'u':
204                         free(arg_user);
205                         arg_user = strdup(optarg);
206                         if (!arg_user)
207                                 return log_oom();
208
209                         break;
210
211                 case 'C':
212                         strv_free(arg_controllers);
213                         arg_controllers = strv_split(optarg, ",");
214                         if (!arg_controllers)
215                                 return log_oom();
216
217                         cg_shorten_controllers(arg_controllers);
218                         break;
219
220                 case ARG_PRIVATE_NETWORK:
221                         arg_private_network = true;
222                         break;
223
224                 case 'b':
225                         arg_boot = true;
226                         break;
227
228                 case ARG_UUID:
229                         if (!id128_is_valid(optarg)) {
230                                 log_error("Invalid UUID: %s", optarg);
231                                 return -EINVAL;
232                         }
233
234                         arg_uuid = optarg;
235                         break;
236
237                 case 'M':
238                         if (!hostname_is_valid(optarg)) {
239                                 log_error("Invalid machine name: %s", optarg);
240                                 return -EINVAL;
241                         }
242
243                         free(arg_machine);
244                         arg_machine = strdup(optarg);
245                         if (!arg_machine)
246                                 return log_oom();
247
248                         break;
249
250                 case ARG_READ_ONLY:
251                         arg_read_only = true;
252                         break;
253
254                 case ARG_CAPABILITY: {
255                         char *state, *word;
256                         size_t length;
257
258                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
259                                 cap_value_t cap;
260                                 char *t;
261
262                                 t = strndup(word, length);
263                                 if (!t)
264                                         return log_oom();
265
266                                 if (cap_from_name(t, &cap) < 0) {
267                                         log_error("Failed to parse capability %s.", t);
268                                         free(t);
269                                         return -EINVAL;
270                                 }
271
272                                 free(t);
273                                 arg_retain |= 1ULL << (uint64_t) cap;
274                         }
275
276                         break;
277                 }
278
279                 case 'j':
280                         arg_link_journal = LINK_GUEST;
281                         break;
282
283                 case ARG_LINK_JOURNAL:
284                         if (streq(optarg, "auto"))
285                                 arg_link_journal = LINK_AUTO;
286                         else if (streq(optarg, "no"))
287                                 arg_link_journal = LINK_NO;
288                         else if (streq(optarg, "guest"))
289                                 arg_link_journal = LINK_GUEST;
290                         else if (streq(optarg, "host"))
291                                 arg_link_journal = LINK_HOST;
292                         else {
293                                 log_error("Failed to parse link journal mode %s", optarg);
294                                 return -EINVAL;
295                         }
296
297                         break;
298
299                 case ARG_BIND:
300                 case ARG_BIND_RO: {
301                         _cleanup_free_ char *a = NULL, *b = NULL;
302                         char *e;
303                         char ***x;
304                         int r;
305
306                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
307
308                         e = strchr(optarg, ':');
309                         if (e) {
310                                 a = strndup(optarg, e - optarg);
311                                 b = strdup(e + 1);
312                         } else {
313                                 a = strdup(optarg);
314                                 b = strdup(optarg);
315                         }
316
317                         if (!a || !b)
318                                 return log_oom();
319
320                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
321                                 log_error("Invalid bind mount specification: %s", optarg);
322                                 return -EINVAL;
323                         }
324
325                         r = strv_extend(x, a);
326                         if (r < 0)
327                                 return r;
328
329                         r = strv_extend(x, b);
330                         if (r < 0)
331                                 return r;
332
333                         break;
334                 }
335
336                 case '?':
337                         return -EINVAL;
338
339                 default:
340                         log_error("Unknown option code %c", c);
341                         return -EINVAL;
342                 }
343         }
344
345         return 1;
346 }
347
348 static int mount_all(const char *dest) {
349
350         typedef struct MountPoint {
351                 const char *what;
352                 const char *where;
353                 const char *type;
354                 const char *options;
355                 unsigned long flags;
356                 bool fatal;
357         } MountPoint;
358
359         static const MountPoint mount_table[] = {
360                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
361                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
362                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
363                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
364                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
365                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
366                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
367                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
368 #ifdef HAVE_SELINUX
369                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
370                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
371 #endif
372         };
373
374         unsigned k;
375         int r = 0;
376
377         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
378                 _cleanup_free_ char *where = NULL;
379                 int t;
380
381                 where = strjoin(dest, "/", mount_table[k].where, NULL);
382                 if (!where)
383                         return log_oom();
384
385                 t = path_is_mount_point(where, true);
386                 if (t < 0) {
387                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
388
389                         if (r == 0)
390                                 r = t;
391
392                         continue;
393                 }
394
395                 /* Skip this entry if it is not a remount. */
396                 if (mount_table[k].what && t > 0)
397                         continue;
398
399                 mkdir_p(where, 0755);
400
401                 if (mount(mount_table[k].what,
402                           where,
403                           mount_table[k].type,
404                           mount_table[k].flags,
405                           mount_table[k].options) < 0 &&
406                     mount_table[k].fatal) {
407
408                         log_error("mount(%s) failed: %m", where);
409
410                         if (r == 0)
411                                 r = -errno;
412                 }
413         }
414
415         return r;
416 }
417
418 static int mount_binds(const char *dest, char **l, unsigned long flags) {
419         char **x, **y;
420
421         STRV_FOREACH_PAIR(x, y, l) {
422                 _cleanup_free_ char *where = NULL;
423
424                 where = strjoin(dest, "/", *y, NULL);
425                 if (!where)
426                         return log_oom();
427
428                 mkdir_p_label(where, 0755);
429
430                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
431                         log_error("mount(%s) failed: %m", where);
432                         return -errno;
433                 }
434
435                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
436                         log_error("mount(%s) failed: %m", where);
437                         return -errno;
438                 }
439         }
440
441         return 0;
442 }
443
444 static int setup_timezone(const char *dest) {
445         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
446         char *z, *y;
447         int r;
448
449         assert(dest);
450
451         /* Fix the timezone, if possible */
452         r = readlink_malloc("/etc/localtime", &p);
453         if (r < 0) {
454                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
455                 return 0;
456         }
457
458         z = path_startswith(p, "../usr/share/zoneinfo/");
459         if (!z)
460                 z = path_startswith(p, "/usr/share/zoneinfo/");
461         if (!z) {
462                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
463                 return 0;
464         }
465
466         where = strappend(dest, "/etc/localtime");
467         if (!where)
468                 return log_oom();
469
470         r = readlink_malloc(where, &q);
471         if (r >= 0) {
472                 y = path_startswith(q, "../usr/share/zoneinfo/");
473                 if (!y)
474                         y = path_startswith(q, "/usr/share/zoneinfo/");
475
476
477                 /* Already pointing to the right place? Then do nothing .. */
478                 if (y && streq(y, z))
479                         return 0;
480         }
481
482         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
483         if (!check)
484                 return log_oom();
485
486         if (access(check, F_OK) < 0) {
487                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
488                 return 0;
489         }
490
491         what = strappend("../usr/share/zoneinfo/", z);
492         if (!what)
493                 return log_oom();
494
495         unlink(where);
496         if (symlink(what, where) < 0) {
497                 log_error("Failed to correct timezone of container: %m");
498                 return 0;
499         }
500
501         return 0;
502 }
503
504 static int setup_resolv_conf(const char *dest) {
505         char _cleanup_free_ *where = NULL;
506         _cleanup_close_ int fd = -1;
507
508         assert(dest);
509
510         if (arg_private_network)
511                 return 0;
512
513         /* Fix resolv.conf, if possible */
514         where = strappend(dest, "/etc/resolv.conf");
515         if (!where)
516                 return log_oom();
517
518         fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
519
520         /* We don't really care for the results of this really. If it
521          * fails, it fails, but meh... */
522         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
523                 log_warning("Failed to bind mount /etc/resolv.conf: %m");
524         else
525                 if (mount("/etc/resolv.conf", where, "bind",
526                           MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
527                         log_error("Failed to remount /etc/resolv.conf readonly: %m");
528                         return -errno;
529                 }
530
531         return 0;
532 }
533
534 static int setup_boot_id(const char *dest) {
535         _cleanup_free_ char *from = NULL, *to = NULL;
536         sd_id128_t rnd;
537         char as_uuid[37];
538         int r;
539
540         assert(dest);
541
542         /* Generate a new randomized boot ID, so that each boot-up of
543          * the container gets a new one */
544
545         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
546         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
547         if (!from || !to)
548                 return log_oom();
549
550         r = sd_id128_randomize(&rnd);
551         if (r < 0) {
552                 log_error("Failed to generate random boot id: %s", strerror(-r));
553                 return r;
554         }
555
556         snprintf(as_uuid, sizeof(as_uuid),
557                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
558                  SD_ID128_FORMAT_VAL(rnd));
559         char_array_0(as_uuid);
560
561         r = write_string_file(from, as_uuid);
562         if (r < 0) {
563                 log_error("Failed to write boot id: %s", strerror(-r));
564                 return r;
565         }
566
567         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
568                 log_error("Failed to bind mount boot id: %m");
569                 r = -errno;
570         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
571                 log_warning("Failed to make boot id read-only: %m");
572
573         unlink(from);
574         return r;
575 }
576
577 static int copy_devnodes(const char *dest) {
578
579         static const char devnodes[] =
580                 "null\0"
581                 "zero\0"
582                 "full\0"
583                 "random\0"
584                 "urandom\0"
585                 "tty\0";
586
587         const char *d;
588         int r = 0;
589         _cleanup_umask_ mode_t u;
590
591         assert(dest);
592
593         u = umask(0000);
594
595         NULSTR_FOREACH(d, devnodes) {
596                 struct stat st;
597                 _cleanup_free_ char *from = NULL, *to = NULL;
598
599                 asprintf(&from, "/dev/%s", d);
600                 asprintf(&to, "%s/dev/%s", dest, d);
601
602                 if (!from || !to) {
603                         log_oom();
604
605                         if (r == 0)
606                                 r = -ENOMEM;
607
608                         break;
609                 }
610
611                 if (stat(from, &st) < 0) {
612
613                         if (errno != ENOENT) {
614                                 log_error("Failed to stat %s: %m", from);
615                                 if (r == 0)
616                                         r = -errno;
617                         }
618
619                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
620
621                         log_error("%s is not a char or block device, cannot copy", from);
622                         if (r == 0)
623                                 r = -EIO;
624
625                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
626
627                         log_error("mknod(%s) failed: %m", dest);
628                         if (r == 0)
629                                 r = -errno;
630                 }
631         }
632
633         return r;
634 }
635
636 static int setup_ptmx(const char *dest) {
637         _cleanup_free_ char *p = NULL;
638
639         p = strappend(dest, "/dev/ptmx");
640         if (!p)
641                 return log_oom();
642
643         if (symlink("pts/ptmx", p) < 0) {
644                 log_error("Failed to create /dev/ptmx symlink: %m");
645                 return -errno;
646         }
647
648         return 0;
649 }
650
651 static int setup_dev_console(const char *dest, const char *console) {
652         struct stat st;
653         _cleanup_free_ char *to = NULL;
654         int r;
655         _cleanup_umask_ mode_t u;
656
657         assert(dest);
658         assert(console);
659
660         u = umask(0000);
661
662         if (stat(console, &st) < 0) {
663                 log_error("Failed to stat %s: %m", console);
664                 return -errno;
665
666         } else if (!S_ISCHR(st.st_mode)) {
667                 log_error("/dev/console is not a char device");
668                 return -EIO;
669         }
670
671         r = chmod_and_chown(console, 0600, 0, 0);
672         if (r < 0) {
673                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
674                 return r;
675         }
676
677         if (asprintf(&to, "%s/dev/console", dest) < 0)
678                 return log_oom();
679
680         /* We need to bind mount the right tty to /dev/console since
681          * ptys can only exist on pts file systems. To have something
682          * to bind mount things on we create a device node first, that
683          * has the right major/minor (note that the major minor
684          * doesn't actually matter here, since we mount it over
685          * anyway). */
686
687         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
688                 log_error("mknod() for /dev/console failed: %m");
689                 return -errno;
690         }
691
692         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
693                 log_error("Bind mount for /dev/console failed: %m");
694                 return -errno;
695         }
696
697         return 0;
698 }
699
700 static int setup_kmsg(const char *dest, int kmsg_socket) {
701         _cleanup_free_ char *from = NULL, *to = NULL;
702         int r, fd, k;
703         _cleanup_umask_ mode_t u;
704         union {
705                 struct cmsghdr cmsghdr;
706                 uint8_t buf[CMSG_SPACE(sizeof(int))];
707         } control = {};
708         struct msghdr mh = {
709                 .msg_control = &control,
710                 .msg_controllen = sizeof(control),
711         };
712         struct cmsghdr *cmsg;
713
714         assert(dest);
715         assert(kmsg_socket >= 0);
716
717         u = umask(0000);
718
719         /* We create the kmsg FIFO as /dev/kmsg, but immediately
720          * delete it after bind mounting it to /proc/kmsg. While FIFOs
721          * on the reading side behave very similar to /proc/kmsg,
722          * their writing side behaves differently from /dev/kmsg in
723          * that writing blocks when nothing is reading. In order to
724          * avoid any problems with containers deadlocking due to this
725          * we simply make /dev/kmsg unavailable to the container. */
726         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
727             asprintf(&to, "%s/proc/kmsg", dest) < 0)
728                 return log_oom();
729
730         if (mkfifo(from, 0600) < 0) {
731                 log_error("mkfifo() for /dev/kmsg failed: %m");
732                 return -errno;
733         }
734
735         r = chmod_and_chown(from, 0600, 0, 0);
736         if (r < 0) {
737                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
738                 return r;
739         }
740
741         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
742                 log_error("Bind mount for /proc/kmsg failed: %m");
743                 return -errno;
744         }
745
746         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
747         if (fd < 0) {
748                 log_error("Failed to open fifo: %m");
749                 return -errno;
750         }
751
752         cmsg = CMSG_FIRSTHDR(&mh);
753         cmsg->cmsg_level = SOL_SOCKET;
754         cmsg->cmsg_type = SCM_RIGHTS;
755         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
756         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
757
758         mh.msg_controllen = cmsg->cmsg_len;
759
760         /* Store away the fd in the socket, so that it stays open as
761          * long as we run the child */
762         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
763         close_nointr_nofail(fd);
764
765         if (k < 0) {
766                 log_error("Failed to send FIFO fd: %m");
767                 return -errno;
768         }
769
770         /* And now make the FIFO unavailable as /dev/kmsg... */
771         unlink(from);
772         return 0;
773 }
774
775 static int setup_hostname(void) {
776
777         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
778                 return -errno;
779
780         return 0;
781 }
782
783 static int setup_journal(const char *directory) {
784         sd_id128_t machine_id;
785         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
786         char *id;
787         int r;
788
789         if (arg_link_journal == LINK_NO)
790                 return 0;
791
792         p = strappend(directory, "/etc/machine-id");
793         if (!p)
794                 return log_oom();
795
796         r = read_one_line_file(p, &b);
797         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
798                 return 0;
799         else if (r < 0) {
800                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
801                 return r;
802         }
803
804         id = strstrip(b);
805         if (isempty(id) && arg_link_journal == LINK_AUTO)
806                 return 0;
807
808         /* Verify validity */
809         r = sd_id128_from_string(id, &machine_id);
810         if (r < 0) {
811                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
812                 return r;
813         }
814
815         free(p);
816         p = strappend("/var/log/journal/", id);
817         q = strjoin(directory, "/var/log/journal/", id, NULL);
818         if (!p || !q)
819                 return log_oom();
820
821         if (path_is_mount_point(p, false) > 0) {
822                 if (arg_link_journal != LINK_AUTO) {
823                         log_error("%s: already a mount point, refusing to use for journal", p);
824                         return -EEXIST;
825                 }
826
827                 return 0;
828         }
829
830         if (path_is_mount_point(q, false) > 0) {
831                 if (arg_link_journal != LINK_AUTO) {
832                         log_error("%s: already a mount point, refusing to use for journal", q);
833                         return -EEXIST;
834                 }
835
836                 return 0;
837         }
838
839         r = readlink_and_make_absolute(p, &d);
840         if (r >= 0) {
841                 if ((arg_link_journal == LINK_GUEST ||
842                      arg_link_journal == LINK_AUTO) &&
843                     path_equal(d, q)) {
844
845                         r = mkdir_p(q, 0755);
846                         if (r < 0)
847                                 log_warning("failed to create directory %s: %m", q);
848                         return 0;
849                 }
850
851                 if (unlink(p) < 0) {
852                         log_error("Failed to remove symlink %s: %m", p);
853                         return -errno;
854                 }
855         } else if (r == -EINVAL) {
856
857                 if (arg_link_journal == LINK_GUEST &&
858                     rmdir(p) < 0) {
859
860                         if (errno == ENOTDIR) {
861                                 log_error("%s already exists and is neither a symlink nor a directory", p);
862                                 return r;
863                         } else {
864                                 log_error("Failed to remove %s: %m", p);
865                                 return -errno;
866                         }
867                 }
868         } else if (r != -ENOENT) {
869                 log_error("readlink(%s) failed: %m", p);
870                 return r;
871         }
872
873         if (arg_link_journal == LINK_GUEST) {
874
875                 if (symlink(q, p) < 0) {
876                         log_error("Failed to symlink %s to %s: %m", q, p);
877                         return -errno;
878                 }
879
880                 r = mkdir_p(q, 0755);
881                 if (r < 0)
882                         log_warning("failed to create directory %s: %m", q);
883                 return 0;
884         }
885
886         if (arg_link_journal == LINK_HOST) {
887                 r = mkdir_p(p, 0755);
888                 if (r < 0) {
889                         log_error("Failed to create %s: %m", p);
890                         return r;
891                 }
892
893         } else if (access(p, F_OK) < 0)
894                 return 0;
895
896         if (dir_is_empty(q) == 0) {
897                 log_error("%s not empty.", q);
898                 return -ENOTEMPTY;
899         }
900
901         r = mkdir_p(q, 0755);
902         if (r < 0) {
903                 log_error("Failed to create %s: %m", q);
904                 return r;
905         }
906
907         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
908                 log_error("Failed to bind mount journal from host into guest: %m");
909                 return -errno;
910         }
911
912         return 0;
913 }
914
915 static int setup_cgroup(const char *path) {
916         char **c;
917         int r;
918
919         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
920         if (r < 0) {
921                 log_error("Failed to create cgroup: %s", strerror(-r));
922                 return r;
923         }
924
925         STRV_FOREACH(c, arg_controllers) {
926                 r = cg_create_and_attach(*c, path, 1);
927                 if (r < 0)
928                         log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
929         }
930
931         return 0;
932 }
933
/*
 * Record metadata about the container as extended attributes on the
 * cgroup directory that belongs to "cgroup":
 *
 *   trusted.init_pid        decimal PID passed in "pid"
 *   trusted.machine_id      "uuid" (skipped when uuid is NULL)
 *   trusted.root_directory  "directory"
 *
 * Failing to resolve the cgroup path is fatal. A failing setxattr() is
 * only logged as a warning and the remaining attributes are still
 * attempted; the first such failure is what gets returned.
 *
 * Returns 0 on success or a negative errno-style code. When built without
 * HAVE_XATTR this is a no-op that returns 0.
 */
static int save_attributes(const char *cgroup, pid_t pid, const char *uuid, const char *directory) {
#ifdef HAVE_XATTR
        _cleanup_free_ char *path = NULL;
        char buf[DECIMAL_STR_MAX(pid_t)];
        int r = 0, k;

        assert(cgroup);
        assert(pid >= 0);
        /* Check the parameter we actually dereference below, not the global. */
        assert(directory);

        assert_se(snprintf(buf, sizeof(buf), "%lu", (unsigned long) pid) < (int) sizeof(buf));

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup, NULL, &path);
        if (r < 0) {
                log_error("Failed to get path: %s", strerror(-r));
                return r;
        }

        /* From here on r holds the first setxattr() failure, if any. */
        r = 0;

        if (setxattr(path, "trusted.init_pid", buf, strlen(buf), XATTR_CREATE) < 0) {
                /* Capture errno before logging so we return a real error code rather than -1. */
                r = -errno;
                log_warning("Failed to set %s attribute on %s: %m", "trusted.init_pid", path);
        }

        if (uuid) {
                if (setxattr(path, "trusted.machine_id", uuid, strlen(uuid), XATTR_CREATE) < 0) {
                        k = -errno;
                        log_warning("Failed to set %s attribute on %s: %m", "trusted.machine_id", path);
                        if (r == 0)
                                r = k;
                }
        }

        if (setxattr(path, "trusted.root_directory", directory, strlen(directory), XATTR_CREATE) < 0) {
                k = -errno;
                log_warning("Failed to set %s attribute on %s: %m", "trusted.root_directory", path);
                if (r == 0)
                        r = k;
        }

        return r;
#else
        return 0;
#endif
}
976
977 static int drop_capabilities(void) {
978         return capability_bounding_set_drop(~arg_retain, false);
979 }
980
981 static int process_pty(int master, pid_t pid, sigset_t *mask) {
982
983         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
984         size_t in_buffer_full = 0, out_buffer_full = 0;
985         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
986         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
987         int ep = -1, signal_fd = -1, r;
988         bool tried_orderly_shutdown = false;
989
990         assert(master >= 0);
991         assert(pid > 0);
992         assert(mask);
993
994         fd_nonblock(STDIN_FILENO, 1);
995         fd_nonblock(STDOUT_FILENO, 1);
996         fd_nonblock(master, 1);
997
998         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
999         if (signal_fd < 0) {
1000                 log_error("signalfd(): %m");
1001                 r = -errno;
1002                 goto finish;
1003         }
1004
1005         ep = epoll_create1(EPOLL_CLOEXEC);
1006         if (ep < 0) {
1007                 log_error("Failed to create epoll: %m");
1008                 r = -errno;
1009                 goto finish;
1010         }
1011
1012         /* We read from STDIN only if this is actually a TTY,
1013          * otherwise we assume non-interactivity. */
1014         if (isatty(STDIN_FILENO)) {
1015                 zero(stdin_ev);
1016                 stdin_ev.events = EPOLLIN|EPOLLET;
1017                 stdin_ev.data.fd = STDIN_FILENO;
1018
1019                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
1020                         log_error("Failed to register STDIN in epoll: %m");
1021                         r = -errno;
1022                         goto finish;
1023                 }
1024         }
1025
1026         zero(stdout_ev);
1027         stdout_ev.events = EPOLLOUT|EPOLLET;
1028         stdout_ev.data.fd = STDOUT_FILENO;
1029
1030         zero(master_ev);
1031         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
1032         master_ev.data.fd = master;
1033
1034         zero(signal_ev);
1035         signal_ev.events = EPOLLIN;
1036         signal_ev.data.fd = signal_fd;
1037
1038         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
1039                 if (errno != EPERM) {
1040                         log_error("Failed to register stdout in epoll: %m");
1041                         r = -errno;
1042                         goto finish;
1043                 }
1044                 /* stdout without epoll support. Likely redirected to regular file. */
1045                 stdout_writable = true;
1046         }
1047
1048         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1049             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1050                 log_error("Failed to register fds in epoll: %m");
1051                 r = -errno;
1052                 goto finish;
1053         }
1054
1055         for (;;) {
1056                 struct epoll_event ev[16];
1057                 ssize_t k;
1058                 int i, nfds;
1059
1060                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1061                 if (nfds < 0) {
1062
1063                         if (errno == EINTR || errno == EAGAIN)
1064                                 continue;
1065
1066                         log_error("epoll_wait(): %m");
1067                         r = -errno;
1068                         goto finish;
1069                 }
1070
1071                 assert(nfds >= 1);
1072
1073                 for (i = 0; i < nfds; i++) {
1074                         if (ev[i].data.fd == STDIN_FILENO) {
1075
1076                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1077                                         stdin_readable = true;
1078
1079                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1080
1081                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1082                                         stdout_writable = true;
1083
1084                         } else if (ev[i].data.fd == master) {
1085
1086                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1087                                         master_readable = true;
1088
1089                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1090                                         master_writable = true;
1091
1092                         } else if (ev[i].data.fd == signal_fd) {
1093                                 struct signalfd_siginfo sfsi;
1094                                 ssize_t n;
1095
1096                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1097                                 if (n != sizeof(sfsi)) {
1098
1099                                         if (n >= 0) {
1100                                                 log_error("Failed to read from signalfd: invalid block size");
1101                                                 r = -EIO;
1102                                                 goto finish;
1103                                         }
1104
1105                                         if (errno != EINTR && errno != EAGAIN) {
1106                                                 log_error("Failed to read from signalfd: %m");
1107                                                 r = -errno;
1108                                                 goto finish;
1109                                         }
1110                                 } else {
1111
1112                                         if (sfsi.ssi_signo == SIGWINCH) {
1113                                                 struct winsize ws;
1114
1115                                                 /* The window size changed, let's forward that. */
1116                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1117                                                         ioctl(master, TIOCSWINSZ, &ws);
1118                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1119
1120                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1121
1122                                                 /* This only works for systemd... */
1123                                                 tried_orderly_shutdown = true;
1124                                                 kill(pid, SIGRTMIN+3);
1125
1126                                         } else {
1127                                                 r = 0;
1128                                                 goto finish;
1129                                         }
1130                                 }
1131                         }
1132                 }
1133
1134                 while ((stdin_readable && in_buffer_full <= 0) ||
1135                        (master_writable && in_buffer_full > 0) ||
1136                        (master_readable && out_buffer_full <= 0) ||
1137                        (stdout_writable && out_buffer_full > 0)) {
1138
1139                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1140
1141                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1142                                 if (k < 0) {
1143
1144                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1145                                                 stdin_readable = false;
1146                                         else {
1147                                                 log_error("read(): %m");
1148                                                 r = -errno;
1149                                                 goto finish;
1150                                         }
1151                                 } else
1152                                         in_buffer_full += (size_t) k;
1153                         }
1154
1155                         if (master_writable && in_buffer_full > 0) {
1156
1157                                 k = write(master, in_buffer, in_buffer_full);
1158                                 if (k < 0) {
1159
1160                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1161                                                 master_writable = false;
1162                                         else {
1163                                                 log_error("write(): %m");
1164                                                 r = -errno;
1165                                                 goto finish;
1166                                         }
1167
1168                                 } else {
1169                                         assert(in_buffer_full >= (size_t) k);
1170                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1171                                         in_buffer_full -= k;
1172                                 }
1173                         }
1174
1175                         if (master_readable && out_buffer_full < LINE_MAX) {
1176
1177                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1178                                 if (k < 0) {
1179
1180                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1181                                                 master_readable = false;
1182                                         else {
1183                                                 log_error("read(): %m");
1184                                                 r = -errno;
1185                                                 goto finish;
1186                                         }
1187                                 }  else
1188                                         out_buffer_full += (size_t) k;
1189                         }
1190
1191                         if (stdout_writable && out_buffer_full > 0) {
1192
1193                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1194                                 if (k < 0) {
1195
1196                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1197                                                 stdout_writable = false;
1198                                         else {
1199                                                 log_error("write(): %m");
1200                                                 r = -errno;
1201                                                 goto finish;
1202                                         }
1203
1204                                 } else {
1205                                         assert(out_buffer_full >= (size_t) k);
1206                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1207                                         out_buffer_full -= k;
1208                                 }
1209                         }
1210                 }
1211         }
1212
1213 finish:
1214         if (ep >= 0)
1215                 close_nointr_nofail(ep);
1216
1217         if (signal_fd >= 0)
1218                 close_nointr_nofail(signal_fd);
1219
1220         return r;
1221 }
1222
/*
 * Report whether a NETLINK_AUDIT socket can be created. The probe socket
 * is closed again right away; only the success of socket() matters.
 */
static bool audit_enabled(void) {
        int probe = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

        if (probe < 0)
                return false;

        close_nointr_nofail(probe);
        return true;
}
1233
1234 int main(int argc, char *argv[]) {
1235         pid_t pid = 0;
1236         int r = EXIT_FAILURE, k;
1237         _cleanup_free_ char *newcg = NULL;
1238         _cleanup_close_ int master = -1;
1239         int n_fd_passed;
1240         const char *console = NULL;
1241         struct termios saved_attr, raw_attr;
1242         sigset_t mask;
1243         bool saved_attr_valid = false;
1244         struct winsize ws;
1245         int kmsg_socket_pair[2] = { -1, -1 };
1246         FDSet *fds = NULL;
1247
1248         log_parse_environment();
1249         log_open();
1250
1251         k = parse_argv(argc, argv);
1252         if (k < 0)
1253                 goto finish;
1254         else if (k == 0) {
1255                 r = EXIT_SUCCESS;
1256                 goto finish;
1257         }
1258
1259         if (arg_directory) {
1260                 char *p;
1261
1262                 p = path_make_absolute_cwd(arg_directory);
1263                 free(arg_directory);
1264                 arg_directory = p;
1265         } else
1266                 arg_directory = get_current_dir_name();
1267
1268         if (!arg_directory) {
1269                 log_error("Failed to determine path, please use -D.");
1270                 goto finish;
1271         }
1272
1273         path_kill_slashes(arg_directory);
1274
1275         if (!arg_machine) {
1276                 arg_machine = strdup(path_get_file_name(arg_directory));
1277                 if (!arg_machine) {
1278                         log_oom();
1279                         goto finish;
1280                 }
1281
1282                 hostname_cleanup(arg_machine, false);
1283                 if (isempty(arg_machine)) {
1284                         log_error("Failed to determine machine name automatically, please use -M.");
1285                         goto finish;
1286                 }
1287         }
1288
1289         if (geteuid() != 0) {
1290                 log_error("Need to be root.");
1291                 goto finish;
1292         }
1293
1294         if (sd_booted() <= 0) {
1295                 log_error("Not running on a systemd system.");
1296                 goto finish;
1297         }
1298
1299         if (audit_enabled()) {
1300                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1301                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1302                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1303                 sleep(5);
1304         }
1305
1306         if (path_equal(arg_directory, "/")) {
1307                 log_error("Spawning container on root directory not supported.");
1308                 goto finish;
1309         }
1310
1311         if (path_is_os_tree(arg_directory) <= 0) {
1312                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1313                 goto finish;
1314         }
1315
1316         log_close();
1317         n_fd_passed = sd_listen_fds(false);
1318         if (n_fd_passed > 0) {
1319                 k = fdset_new_listen_fds(&fds, false);
1320                 if (k < 0) {
1321                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1322                         goto finish;
1323                 }
1324         }
1325         fdset_close_others(fds);
1326         log_open();
1327
1328         k = cg_get_machine_path(arg_machine, &newcg);
1329         if (k < 0) {
1330                 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1331                 goto finish;
1332         }
1333
1334         k = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1335         if (k <= 0 && k != -ENOENT) {
1336                 log_error("Container already running.");
1337
1338                 free(newcg);
1339                 newcg = NULL;
1340
1341                 goto finish;
1342         }
1343
1344         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1345         if (master < 0) {
1346                 log_error("Failed to acquire pseudo tty: %m");
1347                 goto finish;
1348         }
1349
1350         console = ptsname(master);
1351         if (!console) {
1352                 log_error("Failed to determine tty name: %m");
1353                 goto finish;
1354         }
1355
1356         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1357
1358         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1359                 ioctl(master, TIOCSWINSZ, &ws);
1360
1361         if (unlockpt(master) < 0) {
1362                 log_error("Failed to unlock tty: %m");
1363                 goto finish;
1364         }
1365
1366         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1367                 saved_attr_valid = true;
1368
1369                 raw_attr = saved_attr;
1370                 cfmakeraw(&raw_attr);
1371                 raw_attr.c_lflag &= ~ECHO;
1372         }
1373
1374         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1375                 log_error("Failed to create kmsg socket pair.");
1376                 goto finish;
1377         }
1378
1379         sd_notify(0, "READY=1");
1380
1381         assert_se(sigemptyset(&mask) == 0);
1382         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1383         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1384
1385         for (;;) {
1386                 siginfo_t status;
1387                 int pipefd[2], pipefd2[2];
1388
1389                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1390                         log_error("pipe2(): %m");
1391                         goto finish;
1392                 }
1393
1394                 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1395                         log_error("pipe2(): %m");
1396                         close_pipe(pipefd);
1397                         goto finish;
1398                 }
1399
1400                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1401                 if (pid < 0) {
1402                         if (errno == EINVAL)
1403                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1404                         else
1405                                 log_error("clone() failed: %m");
1406
1407                         goto finish;
1408                 }
1409
1410                 if (pid == 0) {
1411                         /* child */
1412                         const char *home = NULL;
1413                         uid_t uid = (uid_t) -1;
1414                         gid_t gid = (gid_t) -1;
1415                         unsigned n_env = 2;
1416                         const char *envp[] = {
1417                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1418                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1419                                 NULL, /* TERM */
1420                                 NULL, /* HOME */
1421                                 NULL, /* USER */
1422                                 NULL, /* LOGNAME */
1423                                 NULL, /* container_uuid */
1424                                 NULL, /* LISTEN_FDS */
1425                                 NULL, /* LISTEN_PID */
1426                                 NULL
1427                         };
1428
1429                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1430                         if (envp[n_env])
1431                                 n_env ++;
1432
1433                         /* Wait for the parent process to log our PID */
1434                         close_nointr_nofail(pipefd[1]);
1435                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1436                         close_nointr_nofail(pipefd[0]);
1437
1438                         close_nointr_nofail(master);
1439                         master = -1;
1440
1441                         if (saved_attr_valid) {
1442                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1443                                         log_error("Failed to set terminal attributes: %m");
1444                                         goto child_fail;
1445                                 }
1446                         }
1447
1448                         close_nointr(STDIN_FILENO);
1449                         close_nointr(STDOUT_FILENO);
1450                         close_nointr(STDERR_FILENO);
1451
1452                         close_nointr_nofail(kmsg_socket_pair[0]);
1453                         kmsg_socket_pair[0] = -1;
1454
1455                         reset_all_signal_handlers();
1456
1457                         assert_se(sigemptyset(&mask) == 0);
1458                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1459
1460                         k = open_terminal(console, O_RDWR);
1461                         if (k != STDIN_FILENO) {
1462                                 if (k >= 0) {
1463                                         close_nointr_nofail(k);
1464                                         k = -EINVAL;
1465                                 }
1466
1467                                 log_error("Failed to open console: %s", strerror(-k));
1468                                 goto child_fail;
1469                         }
1470
1471                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1472                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1473                                 log_error("Failed to duplicate console: %m");
1474                                 goto child_fail;
1475                         }
1476
1477                         if (setsid() < 0) {
1478                                 log_error("setsid() failed: %m");
1479                                 goto child_fail;
1480                         }
1481
1482                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1483                                 log_error("PR_SET_PDEATHSIG failed: %m");
1484                                 goto child_fail;
1485                         }
1486
1487                         if (setup_cgroup(newcg) < 0)
1488                                 goto child_fail;
1489
1490                         close_pipe(pipefd2);
1491
1492                         /* Mark everything as slave, so that we still
1493                          * receive mounts from the real root, but don't
1494                          * propagate mounts to the real root. */
1495                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1496                                 log_error("MS_SLAVE|MS_REC failed: %m");
1497                                 goto child_fail;
1498                         }
1499
1500                         /* Turn directory into bind mount */
1501                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1502                                 log_error("Failed to make bind mount.");
1503                                 goto child_fail;
1504                         }
1505
1506                         if (arg_read_only)
1507                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1508                                         log_error("Failed to make read-only.");
1509                                         goto child_fail;
1510                                 }
1511
1512                         if (mount_all(arg_directory) < 0)
1513                                 goto child_fail;
1514
1515                         if (copy_devnodes(arg_directory) < 0)
1516                                 goto child_fail;
1517
1518                         if (setup_ptmx(arg_directory) < 0)
1519                                 goto child_fail;
1520
1521                         dev_setup(arg_directory);
1522
1523                         if (setup_dev_console(arg_directory, console) < 0)
1524                                 goto child_fail;
1525
1526                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1527                                 goto child_fail;
1528
1529                         close_nointr_nofail(kmsg_socket_pair[1]);
1530                         kmsg_socket_pair[1] = -1;
1531
1532                         if (setup_boot_id(arg_directory) < 0)
1533                                 goto child_fail;
1534
1535                         if (setup_timezone(arg_directory) < 0)
1536                                 goto child_fail;
1537
1538                         if (setup_resolv_conf(arg_directory) < 0)
1539                                 goto child_fail;
1540
1541                         if (setup_journal(arg_directory) < 0)
1542                                 goto child_fail;
1543
1544                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1545                                 goto child_fail;
1546
1547                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1548                                 goto child_fail;
1549
1550                         if (chdir(arg_directory) < 0) {
1551                                 log_error("chdir(%s) failed: %m", arg_directory);
1552                                 goto child_fail;
1553                         }
1554
1555                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1556                                 log_error("mount(MS_MOVE) failed: %m");
1557                                 goto child_fail;
1558                         }
1559
1560                         if (chroot(".") < 0) {
1561                                 log_error("chroot() failed: %m");
1562                                 goto child_fail;
1563                         }
1564
1565                         if (chdir("/") < 0) {
1566                                 log_error("chdir() failed: %m");
1567                                 goto child_fail;
1568                         }
1569
1570                         umask(0022);
1571
1572                         loopback_setup();
1573
1574                         if (drop_capabilities() < 0) {
1575                                 log_error("drop_capabilities() failed: %m");
1576                                 goto child_fail;
1577                         }
1578
1579                         if (arg_user) {
1580
1581                                 /* Note that this resolves user names
1582                                  * inside the container, and hence
1583                                  * accesses the NSS modules from the
1584                                  * container and not the host. This is
1585                                  * a bit weird... */
1586
1587                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1588                                         log_error("get_user_creds() failed: %m");
1589                                         goto child_fail;
1590                                 }
1591
1592                                 if (mkdir_parents_label(home, 0775) < 0) {
1593                                         log_error("mkdir_parents_label() failed: %m");
1594                                         goto child_fail;
1595                                 }
1596
1597                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1598                                         log_error("mkdir_safe_label() failed: %m");
1599                                         goto child_fail;
1600                                 }
1601
1602                                 if (initgroups((const char*)arg_user, gid) < 0) {
1603                                         log_error("initgroups() failed: %m");
1604                                         goto child_fail;
1605                                 }
1606
1607                                 if (setresgid(gid, gid, gid) < 0) {
1608                                         log_error("setregid() failed: %m");
1609                                         goto child_fail;
1610                                 }
1611
1612                                 if (setresuid(uid, uid, uid) < 0) {
1613                                         log_error("setreuid() failed: %m");
1614                                         goto child_fail;
1615                                 }
1616                         } else {
1617                                 /* Reset everything fully to 0, just in case */
1618
1619                                 if (setgroups(0, NULL) < 0) {
1620                                         log_error("setgroups() failed: %m");
1621                                         goto child_fail;
1622                                 }
1623
1624                                 if (setresgid(0, 0, 0) < 0) {
1625                                         log_error("setregid() failed: %m");
1626                                         goto child_fail;
1627                                 }
1628
1629                                 if (setresuid(0, 0, 0) < 0) {
1630                                         log_error("setreuid() failed: %m");
1631                                         goto child_fail;
1632                                 }
1633                         }
1634
1635                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1636                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1637                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1638                                 log_oom();
1639                                 goto child_fail;
1640                         }
1641
1642                         if (arg_uuid) {
1643                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1644                                         log_oom();
1645                                         goto child_fail;
1646                                 }
1647                         }
1648
1649                         if (fdset_size(fds) > 0) {
1650                                 k = fdset_cloexec(fds, false);
1651                                 if (k < 0) {
1652                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1653                                         goto child_fail;
1654                                 }
1655
1656                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1657                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1658                                         log_oom();
1659                                         goto child_fail;
1660                                 }
1661                         }
1662
1663                         setup_hostname();
1664
1665                         if (arg_boot) {
1666                                 char **a;
1667                                 size_t l;
1668
1669                                 /* Automatically search for the init system */
1670
1671                                 l = 1 + argc - optind;
1672                                 a = newa(char*, l + 1);
1673                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1674
1675                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1676                                 execve(a[0], a, (char**) envp);
1677
1678                                 a[0] = (char*) "/lib/systemd/systemd";
1679                                 execve(a[0], a, (char**) envp);
1680
1681                                 a[0] = (char*) "/sbin/init";
1682                                 execve(a[0], a, (char**) envp);
1683                         } else if (argc > optind)
1684                                 execvpe(argv[optind], argv + optind, (char**) envp);
1685                         else {
1686                                 chdir(home ? home : "/root");
1687                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1688                         }
1689
1690                         log_error("execv() failed: %m");
1691
1692                 child_fail:
1693                         _exit(EXIT_FAILURE);
1694                 }
1695
1696                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1697                 close_nointr_nofail(pipefd[0]);
1698                 close_nointr_nofail(pipefd[1]);
1699
1700                 /* Wait for the child process to establish cgroup hierarchy */
1701                 close_nointr_nofail(pipefd2[1]);
1702                 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1703                 close_nointr_nofail(pipefd2[0]);
1704
1705                 save_attributes(newcg, pid, arg_uuid, arg_directory);
1706
1707                 fdset_free(fds);
1708                 fds = NULL;
1709
1710                 if (process_pty(master, pid, &mask) < 0)
1711                         goto finish;
1712
1713                 if (saved_attr_valid)
1714                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1715
1716                 k = wait_for_terminate(pid, &status);
1717                 if (k < 0) {
1718                         r = EXIT_FAILURE;
1719                         break;
1720                 }
1721
1722                 if (status.si_code == CLD_EXITED) {
1723                         r = status.si_status;
1724                         if (status.si_status != 0) {
1725                                 log_error("Container failed with error code %i.", status.si_status);
1726                                 break;
1727                         }
1728
1729                         log_debug("Container exited successfully.");
1730                         break;
1731                 } else if (status.si_code == CLD_KILLED &&
1732                            status.si_status == SIGINT) {
1733                         log_info("Container has been shut down.");
1734                         r = 0;
1735                         break;
1736                 } else if (status.si_code == CLD_KILLED &&
1737                            status.si_status == SIGHUP) {
1738                         log_info("Container is being rebooted.");
1739                         continue;
1740                 } else if (status.si_code == CLD_KILLED ||
1741                            status.si_code == CLD_DUMPED) {
1742
1743                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1744                         r = EXIT_FAILURE;
1745                         break;
1746                 } else {
1747                         log_error("Container failed due to unknown reason.");
1748                         r = EXIT_FAILURE;
1749                         break;
1750                 }
1751         }
1752
1753 finish:
1754         if (saved_attr_valid)
1755                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1756
1757         close_pipe(kmsg_socket_pair);
1758
1759         if (newcg)
1760                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1761
1762         free(arg_directory);
1763         free(arg_machine);
1764         strv_free(arg_controllers);
1765
1766         fdset_free(fds);
1767
1768         return r;
1769 }