chiark / gitweb /
audit: since audit is apparently never going to be fixed for containers tell the...
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44
45 #ifdef HAVE_XATTR
46 #include <attr/xattr.h>
47 #endif
48
49 #include <systemd/sd-daemon.h>
50
51 #include "log.h"
52 #include "util.h"
53 #include "mkdir.h"
54 #include "macro.h"
55 #include "audit.h"
56 #include "missing.h"
57 #include "cgroup-util.h"
58 #include "strv.h"
59 #include "path-util.h"
60 #include "loopback-setup.h"
61 #include "sd-id128.h"
62 #include "dev-setup.h"
63 #include "fdset.h"
64 #include "build.h"
65 #include "fileio.h"
66
67 #ifndef TTY_GID
68 #define TTY_GID 5
69 #endif
70
/* Selects how setup_journal() connects the container's journal
 * directory with the host's (see --link-journal= and -j). */
typedef enum LinkJournal {
        LINK_NO    = 0,   /* leave the journal directories alone */
        LINK_AUTO  = 1,   /* default; skips quietly where the other modes would fail */
        LINK_HOST  = 2,   /* journal directory lives on the host */
        LINK_GUEST = 3    /* journal directory lives in the guest */
} LinkJournal;
77
78 static char *arg_directory = NULL;
79 static char *arg_user = NULL;
80 static char **arg_controllers = NULL;
81 static char *arg_uuid = NULL;
82 static char *arg_machine = NULL;
83 static bool arg_private_network = false;
84 static bool arg_read_only = false;
85 static bool arg_boot = false;
86 static LinkJournal arg_link_journal = LINK_AUTO;
87 static uint64_t arg_retain =
88         (1ULL << CAP_CHOWN) |
89         (1ULL << CAP_DAC_OVERRIDE) |
90         (1ULL << CAP_DAC_READ_SEARCH) |
91         (1ULL << CAP_FOWNER) |
92         (1ULL << CAP_FSETID) |
93         (1ULL << CAP_IPC_OWNER) |
94         (1ULL << CAP_KILL) |
95         (1ULL << CAP_LEASE) |
96         (1ULL << CAP_LINUX_IMMUTABLE) |
97         (1ULL << CAP_NET_BIND_SERVICE) |
98         (1ULL << CAP_NET_BROADCAST) |
99         (1ULL << CAP_NET_RAW) |
100         (1ULL << CAP_SETGID) |
101         (1ULL << CAP_SETFCAP) |
102         (1ULL << CAP_SETPCAP) |
103         (1ULL << CAP_SETUID) |
104         (1ULL << CAP_SYS_ADMIN) |
105         (1ULL << CAP_SYS_CHROOT) |
106         (1ULL << CAP_SYS_NICE) |
107         (1ULL << CAP_SYS_PTRACE) |
108         (1ULL << CAP_SYS_TTY_CONFIG) |
109         (1ULL << CAP_SYS_RESOURCE) |
110         (1ULL << CAP_SYS_BOOT) |
111         (1ULL << CAP_AUDIT_WRITE) |
112         (1ULL << CAP_AUDIT_CONTROL);
113 static char **arg_bind = NULL;
114 static char **arg_bind_ro = NULL;
115
116 static int help(void) {
117
118         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
119                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
120                "  -h --help                Show this help\n"
121                "     --version             Print version string\n"
122                "  -D --directory=NAME      Root directory for the container\n"
123                "  -b --boot                Boot up full system (i.e. invoke init)\n"
124                "  -u --user=USER           Run the command under specified user or uid\n"
125                "  -C --controllers=LIST    Put the container in specified comma-separated\n"
126                "                           cgroup hierarchies\n"
127                "     --uuid=UUID           Set a specific machine UUID for the container\n"
128                "  -M --machine=NAME        Set the machine name for the container\n"
129                "     --private-network     Disable network in container\n"
130                "     --read-only           Mount the root directory read-only\n"
131                "     --capability=CAP      In addition to the default, retain specified\n"
132                "                           capability\n"
133                "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
134                "  -j                       Equivalent to --link-journal=host\n"
135                "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
136                "                           the container\n"
137                "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
138                program_invocation_short_name);
139
140         return 0;
141 }
142
143 static int parse_argv(int argc, char *argv[]) {
144
145         enum {
146                 ARG_VERSION = 0x100,
147                 ARG_PRIVATE_NETWORK,
148                 ARG_UUID,
149                 ARG_READ_ONLY,
150                 ARG_CAPABILITY,
151                 ARG_LINK_JOURNAL,
152                 ARG_BIND,
153                 ARG_BIND_RO
154         };
155
156         static const struct option options[] = {
157                 { "help",            no_argument,       NULL, 'h'                 },
158                 { "version",         no_argument,       NULL, ARG_VERSION         },
159                 { "directory",       required_argument, NULL, 'D'                 },
160                 { "user",            required_argument, NULL, 'u'                 },
161                 { "controllers",     required_argument, NULL, 'C'                 },
162                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
163                 { "boot",            no_argument,       NULL, 'b'                 },
164                 { "uuid",            required_argument, NULL, ARG_UUID            },
165                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
166                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
167                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
168                 { "bind",            required_argument, NULL, ARG_BIND            },
169                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
170                 { "machine",         required_argument, NULL, 'M'                 },
171                 { NULL,              0,                 NULL, 0                   }
172         };
173
174         int c;
175
176         assert(argc >= 0);
177         assert(argv);
178
179         while ((c = getopt_long(argc, argv, "+hD:u:C:bM:j", options, NULL)) >= 0) {
180
181                 switch (c) {
182
183                 case 'h':
184                         help();
185                         return 0;
186
187                 case ARG_VERSION:
188                         puts(PACKAGE_STRING);
189                         puts(SYSTEMD_FEATURES);
190                         return 0;
191
192                 case 'D':
193                         free(arg_directory);
194                         arg_directory = canonicalize_file_name(optarg);
195                         if (!arg_directory) {
196                                 log_error("Failed to canonicalize root directory.");
197                                 return -ENOMEM;
198                         }
199
200                         break;
201
202                 case 'u':
203                         free(arg_user);
204                         arg_user = strdup(optarg);
205                         if (!arg_user)
206                                 return log_oom();
207
208                         break;
209
210                 case 'C':
211                         strv_free(arg_controllers);
212                         arg_controllers = strv_split(optarg, ",");
213                         if (!arg_controllers)
214                                 return log_oom();
215
216                         cg_shorten_controllers(arg_controllers);
217                         break;
218
219                 case ARG_PRIVATE_NETWORK:
220                         arg_private_network = true;
221                         break;
222
223                 case 'b':
224                         arg_boot = true;
225                         break;
226
227                 case ARG_UUID:
228                         if (!id128_is_valid(optarg)) {
229                                 log_error("Invalid UUID: %s", optarg);
230                                 return -EINVAL;
231                         }
232
233                         arg_uuid = optarg;
234                         break;
235
236                 case 'M':
237                         if (!hostname_is_valid(optarg)) {
238                                 log_error("Invalid machine name: %s", optarg);
239                                 return -EINVAL;
240                         }
241
242                         free(arg_machine);
243                         arg_machine = strdup(optarg);
244                         if (!arg_machine)
245                                 return log_oom();
246
247                         break;
248
249                 case ARG_READ_ONLY:
250                         arg_read_only = true;
251                         break;
252
253                 case ARG_CAPABILITY: {
254                         char *state, *word;
255                         size_t length;
256
257                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
258                                 cap_value_t cap;
259                                 char *t;
260
261                                 t = strndup(word, length);
262                                 if (!t)
263                                         return log_oom();
264
265                                 if (cap_from_name(t, &cap) < 0) {
266                                         log_error("Failed to parse capability %s.", t);
267                                         free(t);
268                                         return -EINVAL;
269                                 }
270
271                                 free(t);
272                                 arg_retain |= 1ULL << (uint64_t) cap;
273                         }
274
275                         break;
276                 }
277
278                 case 'j':
279                         arg_link_journal = LINK_GUEST;
280                         break;
281
282                 case ARG_LINK_JOURNAL:
283                         if (streq(optarg, "auto"))
284                                 arg_link_journal = LINK_AUTO;
285                         else if (streq(optarg, "no"))
286                                 arg_link_journal = LINK_NO;
287                         else if (streq(optarg, "guest"))
288                                 arg_link_journal = LINK_GUEST;
289                         else if (streq(optarg, "host"))
290                                 arg_link_journal = LINK_HOST;
291                         else {
292                                 log_error("Failed to parse link journal mode %s", optarg);
293                                 return -EINVAL;
294                         }
295
296                         break;
297
298                 case ARG_BIND:
299                 case ARG_BIND_RO: {
300                         _cleanup_free_ char *a = NULL, *b = NULL;
301                         char *e;
302                         char ***x;
303                         int r;
304
305                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
306
307                         e = strchr(optarg, ':');
308                         if (e) {
309                                 a = strndup(optarg, e - optarg);
310                                 b = strdup(e + 1);
311                         } else {
312                                 a = strdup(optarg);
313                                 b = strdup(optarg);
314                         }
315
316                         if (!a || !b)
317                                 return log_oom();
318
319                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
320                                 log_error("Invalid bind mount specification: %s", optarg);
321                                 return -EINVAL;
322                         }
323
324                         r = strv_extend(x, a);
325                         if (r < 0)
326                                 return r;
327
328                         r = strv_extend(x, b);
329                         if (r < 0)
330                                 return r;
331
332                         break;
333                 }
334
335                 case '?':
336                         return -EINVAL;
337
338                 default:
339                         log_error("Unknown option code %c", c);
340                         return -EINVAL;
341                 }
342         }
343
344         return 1;
345 }
346
347 static int mount_all(const char *dest) {
348
349         typedef struct MountPoint {
350                 const char *what;
351                 const char *where;
352                 const char *type;
353                 const char *options;
354                 unsigned long flags;
355                 bool fatal;
356         } MountPoint;
357
358         static const MountPoint mount_table[] = {
359                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
360                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
361                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
362                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
363                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
364                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
365                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
366                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
367 #ifdef HAVE_SELINUX
368                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
369                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
370 #endif
371         };
372
373         unsigned k;
374         int r = 0;
375
376         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
377                 _cleanup_free_ char *where = NULL;
378                 int t;
379
380                 where = strjoin(dest, "/", mount_table[k].where, NULL);
381                 if (!where)
382                         return log_oom();
383
384                 t = path_is_mount_point(where, true);
385                 if (t < 0) {
386                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
387
388                         if (r == 0)
389                                 r = t;
390
391                         continue;
392                 }
393
394                 /* Skip this entry if it is not a remount. */
395                 if (mount_table[k].what && t > 0)
396                         continue;
397
398                 mkdir_p(where, 0755);
399
400                 if (mount(mount_table[k].what,
401                           where,
402                           mount_table[k].type,
403                           mount_table[k].flags,
404                           mount_table[k].options) < 0 &&
405                     mount_table[k].fatal) {
406
407                         log_error("mount(%s) failed: %m", where);
408
409                         if (r == 0)
410                                 r = -errno;
411                 }
412         }
413
414         return r;
415 }
416
417 static int mount_binds(const char *dest, char **l, unsigned long flags) {
418         char **x, **y;
419
420         STRV_FOREACH_PAIR(x, y, l) {
421                 _cleanup_free_ char *where = NULL;
422
423                 where = strjoin(dest, "/", *y, NULL);
424                 if (!where)
425                         return log_oom();
426
427                 mkdir_p_label(where, 0755);
428
429                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
430                         log_error("mount(%s) failed: %m", where);
431                         return -errno;
432                 }
433
434                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
435                         log_error("mount(%s) failed: %m", where);
436                         return -errno;
437                 }
438         }
439
440         return 0;
441 }
442
443 static int setup_timezone(const char *dest) {
444         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
445         char *z, *y;
446         int r;
447
448         assert(dest);
449
450         /* Fix the timezone, if possible */
451         r = readlink_malloc("/etc/localtime", &p);
452         if (r < 0) {
453                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
454                 return 0;
455         }
456
457         z = path_startswith(p, "../usr/share/zoneinfo/");
458         if (!z)
459                 z = path_startswith(p, "/usr/share/zoneinfo/");
460         if (!z) {
461                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
462                 return 0;
463         }
464
465         where = strappend(dest, "/etc/localtime");
466         if (!where)
467                 return log_oom();
468
469         r = readlink_malloc(where, &q);
470         if (r >= 0) {
471                 y = path_startswith(q, "../usr/share/zoneinfo/");
472                 if (!y)
473                         y = path_startswith(q, "/usr/share/zoneinfo/");
474
475
476                 /* Already pointing to the right place? Then do nothing .. */
477                 if (y && streq(y, z))
478                         return 0;
479         }
480
481         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
482         if (!check)
483                 return log_oom();
484
485         if (access(check, F_OK) < 0) {
486                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
487                 return 0;
488         }
489
490         what = strappend("../usr/share/zoneinfo/", z);
491         if (!what)
492                 return log_oom();
493
494         unlink(where);
495         if (symlink(what, where) < 0) {
496                 log_error("Failed to correct timezone of container: %m");
497                 return 0;
498         }
499
500         return 0;
501 }
502
503 static int setup_resolv_conf(const char *dest) {
504         char _cleanup_free_ *where = NULL;
505         _cleanup_close_ int fd = -1;
506
507         assert(dest);
508
509         if (arg_private_network)
510                 return 0;
511
512         /* Fix resolv.conf, if possible */
513         where = strappend(dest, "/etc/resolv.conf");
514         if (!where)
515                 return log_oom();
516
517         fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
518
519         /* We don't really care for the results of this really. If it
520          * fails, it fails, but meh... */
521         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
522                 log_warning("Failed to bind mount /etc/resolv.conf: %m");
523         else
524                 if (mount("/etc/resolv.conf", where, "bind",
525                           MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
526                         log_error("Failed to remount /etc/resolv.conf readonly: %m");
527                         return -errno;
528                 }
529
530         return 0;
531 }
532
533 static int setup_boot_id(const char *dest) {
534         _cleanup_free_ char *from = NULL, *to = NULL;
535         sd_id128_t rnd;
536         char as_uuid[37];
537         int r;
538
539         assert(dest);
540
541         /* Generate a new randomized boot ID, so that each boot-up of
542          * the container gets a new one */
543
544         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
545         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
546         if (!from || !to)
547                 return log_oom();
548
549         r = sd_id128_randomize(&rnd);
550         if (r < 0) {
551                 log_error("Failed to generate random boot id: %s", strerror(-r));
552                 return r;
553         }
554
555         snprintf(as_uuid, sizeof(as_uuid),
556                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
557                  SD_ID128_FORMAT_VAL(rnd));
558         char_array_0(as_uuid);
559
560         r = write_string_file(from, as_uuid);
561         if (r < 0) {
562                 log_error("Failed to write boot id: %s", strerror(-r));
563                 return r;
564         }
565
566         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
567                 log_error("Failed to bind mount boot id: %m");
568                 r = -errno;
569         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
570                 log_warning("Failed to make boot id read-only: %m");
571
572         unlink(from);
573         return r;
574 }
575
576 static int copy_devnodes(const char *dest) {
577
578         static const char devnodes[] =
579                 "null\0"
580                 "zero\0"
581                 "full\0"
582                 "random\0"
583                 "urandom\0"
584                 "tty\0";
585
586         const char *d;
587         int r = 0;
588         _cleanup_umask_ mode_t u;
589
590         assert(dest);
591
592         u = umask(0000);
593
594         NULSTR_FOREACH(d, devnodes) {
595                 struct stat st;
596                 _cleanup_free_ char *from = NULL, *to = NULL;
597
598                 asprintf(&from, "/dev/%s", d);
599                 asprintf(&to, "%s/dev/%s", dest, d);
600
601                 if (!from || !to) {
602                         log_oom();
603
604                         if (r == 0)
605                                 r = -ENOMEM;
606
607                         break;
608                 }
609
610                 if (stat(from, &st) < 0) {
611
612                         if (errno != ENOENT) {
613                                 log_error("Failed to stat %s: %m", from);
614                                 if (r == 0)
615                                         r = -errno;
616                         }
617
618                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
619
620                         log_error("%s is not a char or block device, cannot copy", from);
621                         if (r == 0)
622                                 r = -EIO;
623
624                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
625
626                         log_error("mknod(%s) failed: %m", dest);
627                         if (r == 0)
628                                 r = -errno;
629                 }
630         }
631
632         return r;
633 }
634
635 static int setup_ptmx(const char *dest) {
636         _cleanup_free_ char *p = NULL;
637
638         p = strappend(dest, "/dev/ptmx");
639         if (!p)
640                 return log_oom();
641
642         if (symlink("pts/ptmx", p) < 0) {
643                 log_error("Failed to create /dev/ptmx symlink: %m");
644                 return -errno;
645         }
646
647         return 0;
648 }
649
650 static int setup_dev_console(const char *dest, const char *console) {
651         struct stat st;
652         _cleanup_free_ char *to = NULL;
653         int r;
654         _cleanup_umask_ mode_t u;
655
656         assert(dest);
657         assert(console);
658
659         u = umask(0000);
660
661         if (stat(console, &st) < 0) {
662                 log_error("Failed to stat %s: %m", console);
663                 return -errno;
664
665         } else if (!S_ISCHR(st.st_mode)) {
666                 log_error("/dev/console is not a char device");
667                 return -EIO;
668         }
669
670         r = chmod_and_chown(console, 0600, 0, 0);
671         if (r < 0) {
672                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
673                 return r;
674         }
675
676         if (asprintf(&to, "%s/dev/console", dest) < 0)
677                 return log_oom();
678
679         /* We need to bind mount the right tty to /dev/console since
680          * ptys can only exist on pts file systems. To have something
681          * to bind mount things on we create a device node first, that
682          * has the right major/minor (note that the major minor
683          * doesn't actually matter here, since we mount it over
684          * anyway). */
685
686         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
687                 log_error("mknod() for /dev/console failed: %m");
688                 return -errno;
689         }
690
691         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
692                 log_error("Bind mount for /dev/console failed: %m");
693                 return -errno;
694         }
695
696         return 0;
697 }
698
699 static int setup_kmsg(const char *dest, int kmsg_socket) {
700         _cleanup_free_ char *from = NULL, *to = NULL;
701         int r, fd, k;
702         _cleanup_umask_ mode_t u;
703         union {
704                 struct cmsghdr cmsghdr;
705                 uint8_t buf[CMSG_SPACE(sizeof(int))];
706         } control = {};
707         struct msghdr mh = {
708                 .msg_control = &control,
709                 .msg_controllen = sizeof(control),
710         };
711         struct cmsghdr *cmsg;
712
713         assert(dest);
714         assert(kmsg_socket >= 0);
715
716         u = umask(0000);
717
718         /* We create the kmsg FIFO as /dev/kmsg, but immediately
719          * delete it after bind mounting it to /proc/kmsg. While FIFOs
720          * on the reading side behave very similar to /proc/kmsg,
721          * their writing side behaves differently from /dev/kmsg in
722          * that writing blocks when nothing is reading. In order to
723          * avoid any problems with containers deadlocking due to this
724          * we simply make /dev/kmsg unavailable to the container. */
725         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
726             asprintf(&to, "%s/proc/kmsg", dest) < 0)
727                 return log_oom();
728
729         if (mkfifo(from, 0600) < 0) {
730                 log_error("mkfifo() for /dev/kmsg failed: %m");
731                 return -errno;
732         }
733
734         r = chmod_and_chown(from, 0600, 0, 0);
735         if (r < 0) {
736                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
737                 return r;
738         }
739
740         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
741                 log_error("Bind mount for /proc/kmsg failed: %m");
742                 return -errno;
743         }
744
745         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
746         if (fd < 0) {
747                 log_error("Failed to open fifo: %m");
748                 return -errno;
749         }
750
751         cmsg = CMSG_FIRSTHDR(&mh);
752         cmsg->cmsg_level = SOL_SOCKET;
753         cmsg->cmsg_type = SCM_RIGHTS;
754         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
755         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
756
757         mh.msg_controllen = cmsg->cmsg_len;
758
759         /* Store away the fd in the socket, so that it stays open as
760          * long as we run the child */
761         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
762         close_nointr_nofail(fd);
763
764         if (k < 0) {
765                 log_error("Failed to send FIFO fd: %m");
766                 return -errno;
767         }
768
769         /* And now make the FIFO unavailable as /dev/kmsg... */
770         unlink(from);
771         return 0;
772 }
773
774 static int setup_hostname(void) {
775
776         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
777                 return -errno;
778
779         return 0;
780 }
781
782 static int setup_journal(const char *directory) {
783         sd_id128_t machine_id;
784         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
785         char *id;
786         int r;
787
788         if (arg_link_journal == LINK_NO)
789                 return 0;
790
791         p = strappend(directory, "/etc/machine-id");
792         if (!p)
793                 return log_oom();
794
795         r = read_one_line_file(p, &b);
796         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
797                 return 0;
798         else if (r < 0) {
799                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
800                 return r;
801         }
802
803         id = strstrip(b);
804         if (isempty(id) && arg_link_journal == LINK_AUTO)
805                 return 0;
806
807         /* Verify validity */
808         r = sd_id128_from_string(id, &machine_id);
809         if (r < 0) {
810                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
811                 return r;
812         }
813
814         free(p);
815         p = strappend("/var/log/journal/", id);
816         q = strjoin(directory, "/var/log/journal/", id, NULL);
817         if (!p || !q)
818                 return log_oom();
819
820         if (path_is_mount_point(p, false) > 0) {
821                 if (arg_link_journal != LINK_AUTO) {
822                         log_error("%s: already a mount point, refusing to use for journal", p);
823                         return -EEXIST;
824                 }
825
826                 return 0;
827         }
828
829         if (path_is_mount_point(q, false) > 0) {
830                 if (arg_link_journal != LINK_AUTO) {
831                         log_error("%s: already a mount point, refusing to use for journal", q);
832                         return -EEXIST;
833                 }
834
835                 return 0;
836         }
837
838         r = readlink_and_make_absolute(p, &d);
839         if (r >= 0) {
840                 if ((arg_link_journal == LINK_GUEST ||
841                      arg_link_journal == LINK_AUTO) &&
842                     path_equal(d, q)) {
843
844                         r = mkdir_p(q, 0755);
845                         if (r < 0)
846                                 log_warning("failed to create directory %s: %m", q);
847                         return 0;
848                 }
849
850                 if (unlink(p) < 0) {
851                         log_error("Failed to remove symlink %s: %m", p);
852                         return -errno;
853                 }
854         } else if (r == -EINVAL) {
855
856                 if (arg_link_journal == LINK_GUEST &&
857                     rmdir(p) < 0) {
858
859                         if (errno == ENOTDIR) {
860                                 log_error("%s already exists and is neither a symlink nor a directory", p);
861                                 return r;
862                         } else {
863                                 log_error("Failed to remove %s: %m", p);
864                                 return -errno;
865                         }
866                 }
867         } else if (r != -ENOENT) {
868                 log_error("readlink(%s) failed: %m", p);
869                 return r;
870         }
871
872         if (arg_link_journal == LINK_GUEST) {
873
874                 if (symlink(q, p) < 0) {
875                         log_error("Failed to symlink %s to %s: %m", q, p);
876                         return -errno;
877                 }
878
879                 r = mkdir_p(q, 0755);
880                 if (r < 0)
881                         log_warning("failed to create directory %s: %m", q);
882                 return 0;
883         }
884
885         if (arg_link_journal == LINK_HOST) {
886                 r = mkdir_p(p, 0755);
887                 if (r < 0) {
888                         log_error("Failed to create %s: %m", p);
889                         return r;
890                 }
891
892         } else if (access(p, F_OK) < 0)
893                 return 0;
894
895         if (dir_is_empty(q) == 0) {
896                 log_error("%s not empty.", q);
897                 return -ENOTEMPTY;
898         }
899
900         r = mkdir_p(q, 0755);
901         if (r < 0) {
902                 log_error("Failed to create %s: %m", q);
903                 return r;
904         }
905
906         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
907                 log_error("Failed to bind mount journal from host into guest: %m");
908                 return -errno;
909         }
910
911         return 0;
912 }
913
914 static int setup_cgroup(const char *path) {
915         char **c;
916         int r;
917
918         r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, path, 1);
919         if (r < 0) {
920                 log_error("Failed to create cgroup: %s", strerror(-r));
921                 return r;
922         }
923
924         STRV_FOREACH(c, arg_controllers) {
925                 r = cg_create_and_attach(*c, path, 1);
926                 if (r < 0)
927                         log_warning("Failed to create cgroup in controller %s: %s", *c, strerror(-r));
928         }
929
930         return 0;
931 }
932
/* Records metadata about the container as extended attributes on the
 * directory that cg_get_path() resolves for 'cgroup' in the systemd
 * controller:
 *
 *   trusted.init_pid        decimal string of 'pid'
 *   trusted.machine_id      'uuid' (skipped when uuid is NULL)
 *   trusted.root_directory  'directory'
 *
 * Failing to set an individual attribute is logged as a warning and the
 * remaining attributes are still attempted. Returns 0 on success, the
 * negative error from cg_get_path(), or the negative errno of the first
 * attribute that could not be set. Without HAVE_XATTR this is a no-op
 * that returns 0. */
static int save_attributes(const char *cgroup, pid_t pid, const char *uuid, const char *directory) {
#ifdef HAVE_XATTR
        _cleanup_free_ char *path = NULL;
        char buf[DECIMAL_STR_MAX(pid_t)];
        int r, ret = 0;

        assert(cgroup);
        assert(pid >= 0);
        /* We dereference the parameter below, so check that rather than
         * the arg_directory global. */
        assert(directory);

        assert_se(snprintf(buf, sizeof(buf), "%lu", (unsigned long) pid) < (int) sizeof(buf));

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, cgroup, NULL, &path);
        if (r < 0) {
                log_error("Failed to get path: %s", strerror(-r));
                return r;
        }

        /* setxattr() returns -1 and sets errno; convert that to a negative
         * errno (captured before logging) so that callers see the same
         * error convention as for the cg_get_path() failure above. */
        if (setxattr(path, "trusted.init_pid", buf, strlen(buf), XATTR_CREATE) < 0) {
                r = -errno;
                log_warning("Failed to set %s attribute on %s: %m", "trusted.init_pid", path);
                if (ret == 0)
                        ret = r;
        }

        if (uuid) {
                if (setxattr(path, "trusted.machine_id", uuid, strlen(uuid), XATTR_CREATE) < 0) {
                        r = -errno;
                        log_warning("Failed to set %s attribute on %s: %m", "trusted.machine_id", path);
                        if (ret == 0)
                                ret = r;
                }
        }

        if (setxattr(path, "trusted.root_directory", directory, strlen(directory), XATTR_CREATE) < 0) {
                r = -errno;
                log_warning("Failed to set %s attribute on %s: %m", "trusted.root_directory", path);
                if (ret == 0)
                        ret = r;
        }

        return ret;
#else
        return 0;
#endif
}
975
976 static int drop_capabilities(void) {
977         return capability_bounding_set_drop(~arg_retain, false);
978 }
979
980 static int process_pty(int master, pid_t pid, sigset_t *mask) {
981
982         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
983         size_t in_buffer_full = 0, out_buffer_full = 0;
984         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
985         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
986         int ep = -1, signal_fd = -1, r;
987         bool tried_orderly_shutdown = false;
988
989         assert(master >= 0);
990         assert(pid > 0);
991         assert(mask);
992
993         fd_nonblock(STDIN_FILENO, 1);
994         fd_nonblock(STDOUT_FILENO, 1);
995         fd_nonblock(master, 1);
996
997         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
998         if (signal_fd < 0) {
999                 log_error("signalfd(): %m");
1000                 r = -errno;
1001                 goto finish;
1002         }
1003
1004         ep = epoll_create1(EPOLL_CLOEXEC);
1005         if (ep < 0) {
1006                 log_error("Failed to create epoll: %m");
1007                 r = -errno;
1008                 goto finish;
1009         }
1010
1011         /* We read from STDIN only if this is actually a TTY,
1012          * otherwise we assume non-interactivity. */
1013         if (isatty(STDIN_FILENO)) {
1014                 zero(stdin_ev);
1015                 stdin_ev.events = EPOLLIN|EPOLLET;
1016                 stdin_ev.data.fd = STDIN_FILENO;
1017
1018                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
1019                         log_error("Failed to register STDIN in epoll: %m");
1020                         r = -errno;
1021                         goto finish;
1022                 }
1023         }
1024
1025         zero(stdout_ev);
1026         stdout_ev.events = EPOLLOUT|EPOLLET;
1027         stdout_ev.data.fd = STDOUT_FILENO;
1028
1029         zero(master_ev);
1030         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
1031         master_ev.data.fd = master;
1032
1033         zero(signal_ev);
1034         signal_ev.events = EPOLLIN;
1035         signal_ev.data.fd = signal_fd;
1036
1037         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
1038                 if (errno != EPERM) {
1039                         log_error("Failed to register stdout in epoll: %m");
1040                         r = -errno;
1041                         goto finish;
1042                 }
1043                 /* stdout without epoll support. Likely redirected to regular file. */
1044                 stdout_writable = true;
1045         }
1046
1047         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1048             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1049                 log_error("Failed to register fds in epoll: %m");
1050                 r = -errno;
1051                 goto finish;
1052         }
1053
1054         for (;;) {
1055                 struct epoll_event ev[16];
1056                 ssize_t k;
1057                 int i, nfds;
1058
1059                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1060                 if (nfds < 0) {
1061
1062                         if (errno == EINTR || errno == EAGAIN)
1063                                 continue;
1064
1065                         log_error("epoll_wait(): %m");
1066                         r = -errno;
1067                         goto finish;
1068                 }
1069
1070                 assert(nfds >= 1);
1071
1072                 for (i = 0; i < nfds; i++) {
1073                         if (ev[i].data.fd == STDIN_FILENO) {
1074
1075                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1076                                         stdin_readable = true;
1077
1078                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1079
1080                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1081                                         stdout_writable = true;
1082
1083                         } else if (ev[i].data.fd == master) {
1084
1085                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1086                                         master_readable = true;
1087
1088                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1089                                         master_writable = true;
1090
1091                         } else if (ev[i].data.fd == signal_fd) {
1092                                 struct signalfd_siginfo sfsi;
1093                                 ssize_t n;
1094
1095                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1096                                 if (n != sizeof(sfsi)) {
1097
1098                                         if (n >= 0) {
1099                                                 log_error("Failed to read from signalfd: invalid block size");
1100                                                 r = -EIO;
1101                                                 goto finish;
1102                                         }
1103
1104                                         if (errno != EINTR && errno != EAGAIN) {
1105                                                 log_error("Failed to read from signalfd: %m");
1106                                                 r = -errno;
1107                                                 goto finish;
1108                                         }
1109                                 } else {
1110
1111                                         if (sfsi.ssi_signo == SIGWINCH) {
1112                                                 struct winsize ws;
1113
1114                                                 /* The window size changed, let's forward that. */
1115                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1116                                                         ioctl(master, TIOCSWINSZ, &ws);
1117                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1118
1119                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1120
1121                                                 /* This only works for systemd... */
1122                                                 tried_orderly_shutdown = true;
1123                                                 kill(pid, SIGRTMIN+3);
1124
1125                                         } else {
1126                                                 r = 0;
1127                                                 goto finish;
1128                                         }
1129                                 }
1130                         }
1131                 }
1132
1133                 while ((stdin_readable && in_buffer_full <= 0) ||
1134                        (master_writable && in_buffer_full > 0) ||
1135                        (master_readable && out_buffer_full <= 0) ||
1136                        (stdout_writable && out_buffer_full > 0)) {
1137
1138                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1139
1140                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1141                                 if (k < 0) {
1142
1143                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1144                                                 stdin_readable = false;
1145                                         else {
1146                                                 log_error("read(): %m");
1147                                                 r = -errno;
1148                                                 goto finish;
1149                                         }
1150                                 } else
1151                                         in_buffer_full += (size_t) k;
1152                         }
1153
1154                         if (master_writable && in_buffer_full > 0) {
1155
1156                                 k = write(master, in_buffer, in_buffer_full);
1157                                 if (k < 0) {
1158
1159                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1160                                                 master_writable = false;
1161                                         else {
1162                                                 log_error("write(): %m");
1163                                                 r = -errno;
1164                                                 goto finish;
1165                                         }
1166
1167                                 } else {
1168                                         assert(in_buffer_full >= (size_t) k);
1169                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1170                                         in_buffer_full -= k;
1171                                 }
1172                         }
1173
1174                         if (master_readable && out_buffer_full < LINE_MAX) {
1175
1176                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1177                                 if (k < 0) {
1178
1179                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1180                                                 master_readable = false;
1181                                         else {
1182                                                 log_error("read(): %m");
1183                                                 r = -errno;
1184                                                 goto finish;
1185                                         }
1186                                 }  else
1187                                         out_buffer_full += (size_t) k;
1188                         }
1189
1190                         if (stdout_writable && out_buffer_full > 0) {
1191
1192                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1193                                 if (k < 0) {
1194
1195                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1196                                                 stdout_writable = false;
1197                                         else {
1198                                                 log_error("write(): %m");
1199                                                 r = -errno;
1200                                                 goto finish;
1201                                         }
1202
1203                                 } else {
1204                                         assert(out_buffer_full >= (size_t) k);
1205                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1206                                         out_buffer_full -= k;
1207                                 }
1208                         }
1209                 }
1210         }
1211
1212 finish:
1213         if (ep >= 0)
1214                 close_nointr_nofail(ep);
1215
1216         if (signal_fd >= 0)
1217                 close_nointr_nofail(signal_fd);
1218
1219         return r;
1220 }
1221
1222 static bool audit_enabled(void) {
1223         int fd;
1224
1225         fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);
1226         if (fd >= 0) {
1227                 close_nointr_nofail(fd);
1228                 return true;
1229         }
1230
1231         return false;
1232 }
1233
1234 int main(int argc, char *argv[]) {
1235         pid_t pid = 0;
1236         int r = EXIT_FAILURE, k;
1237         _cleanup_free_ char *newcg = NULL;
1238         _cleanup_close_ int master = -1;
1239         int n_fd_passed;
1240         const char *console = NULL;
1241         struct termios saved_attr, raw_attr;
1242         sigset_t mask;
1243         bool saved_attr_valid = false;
1244         struct winsize ws;
1245         int kmsg_socket_pair[2] = { -1, -1 };
1246         FDSet *fds = NULL;
1247
1248         log_parse_environment();
1249         log_open();
1250
1251         k = parse_argv(argc, argv);
1252         if (k < 0)
1253                 goto finish;
1254         else if (k == 0) {
1255                 r = EXIT_SUCCESS;
1256                 goto finish;
1257         }
1258
1259         if (arg_directory) {
1260                 char *p;
1261
1262                 p = path_make_absolute_cwd(arg_directory);
1263                 free(arg_directory);
1264                 arg_directory = p;
1265         } else
1266                 arg_directory = get_current_dir_name();
1267
1268         if (!arg_directory) {
1269                 log_error("Failed to determine path, please use -D.");
1270                 goto finish;
1271         }
1272
1273         path_kill_slashes(arg_directory);
1274
1275         if (!arg_machine) {
1276                 arg_machine = strdup(path_get_file_name(arg_directory));
1277                 if (!arg_machine) {
1278                         log_oom();
1279                         goto finish;
1280                 }
1281
1282                 hostname_cleanup(arg_machine, false);
1283                 if (isempty(arg_machine)) {
1284                         log_error("Failed to determine machine name automatically, please use -M.");
1285                         goto finish;
1286                 }
1287         }
1288
1289         if (geteuid() != 0) {
1290                 log_error("Need to be root.");
1291                 goto finish;
1292         }
1293
1294         if (sd_booted() <= 0) {
1295                 log_error("Not running on a systemd system.");
1296                 goto finish;
1297         }
1298
1299         if (audit_enabled()) {
1300                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1301                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1302                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1303                 sleep(5);
1304         }
1305
1306         if (path_equal(arg_directory, "/")) {
1307                 log_error("Spawning container on root directory not supported.");
1308                 goto finish;
1309         }
1310
1311         if (path_is_os_tree(arg_directory) <= 0) {
1312                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1313                 goto finish;
1314         }
1315
1316         log_close();
1317         n_fd_passed = sd_listen_fds(false);
1318         if (n_fd_passed > 0) {
1319                 k = fdset_new_listen_fds(&fds, false);
1320                 if (k < 0) {
1321                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1322                         goto finish;
1323                 }
1324         }
1325         fdset_close_others(fds);
1326         log_open();
1327
1328         k = cg_get_machine_path(arg_machine, &newcg);
1329         if (k < 0) {
1330                 log_error("Failed to determine machine cgroup path: %s", strerror(-k));
1331                 goto finish;
1332         }
1333
1334         k = cg_is_empty_recursive(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1335         if (k <= 0 && k != -ENOENT) {
1336                 log_error("Container already running.");
1337
1338                 free(newcg);
1339                 newcg = NULL;
1340
1341                 goto finish;
1342         }
1343
1344         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1345         if (master < 0) {
1346                 log_error("Failed to acquire pseudo tty: %m");
1347                 goto finish;
1348         }
1349
1350         console = ptsname(master);
1351         if (!console) {
1352                 log_error("Failed to determine tty name: %m");
1353                 goto finish;
1354         }
1355
1356         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1357
1358         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1359                 ioctl(master, TIOCSWINSZ, &ws);
1360
1361         if (unlockpt(master) < 0) {
1362                 log_error("Failed to unlock tty: %m");
1363                 goto finish;
1364         }
1365
1366         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1367                 saved_attr_valid = true;
1368
1369                 raw_attr = saved_attr;
1370                 cfmakeraw(&raw_attr);
1371                 raw_attr.c_lflag &= ~ECHO;
1372         }
1373
1374         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1375                 log_error("Failed to create kmsg socket pair.");
1376                 goto finish;
1377         }
1378
1379         sd_notify(0, "READY=1");
1380
1381         assert_se(sigemptyset(&mask) == 0);
1382         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1383         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1384
1385         for (;;) {
1386                 siginfo_t status;
1387                 int pipefd[2], pipefd2[2];
1388
1389                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1390                         log_error("pipe2(): %m");
1391                         goto finish;
1392                 }
1393
1394                 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1395                         log_error("pipe2(): %m");
1396                         close_pipe(pipefd);
1397                         goto finish;
1398                 }
1399
1400                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1401                 if (pid < 0) {
1402                         if (errno == EINVAL)
1403                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1404                         else
1405                                 log_error("clone() failed: %m");
1406
1407                         goto finish;
1408                 }
1409
1410                 if (pid == 0) {
1411                         /* child */
1412                         const char *home = NULL;
1413                         uid_t uid = (uid_t) -1;
1414                         gid_t gid = (gid_t) -1;
1415                         unsigned n_env = 2;
1416                         const char *envp[] = {
1417                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1418                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1419                                 NULL, /* TERM */
1420                                 NULL, /* HOME */
1421                                 NULL, /* USER */
1422                                 NULL, /* LOGNAME */
1423                                 NULL, /* container_uuid */
1424                                 NULL, /* LISTEN_FDS */
1425                                 NULL, /* LISTEN_PID */
1426                                 NULL
1427                         };
1428
1429                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1430                         if (envp[n_env])
1431                                 n_env ++;
1432
1433                         /* Wait for the parent process to log our PID */
1434                         close_nointr_nofail(pipefd[1]);
1435                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1436                         close_nointr_nofail(pipefd[0]);
1437
1438                         close_nointr_nofail(master);
1439                         master = -1;
1440
1441                         if (saved_attr_valid) {
1442                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1443                                         log_error("Failed to set terminal attributes: %m");
1444                                         goto child_fail;
1445                                 }
1446                         }
1447
1448                         close_nointr(STDIN_FILENO);
1449                         close_nointr(STDOUT_FILENO);
1450                         close_nointr(STDERR_FILENO);
1451
1452                         close_nointr_nofail(kmsg_socket_pair[0]);
1453                         kmsg_socket_pair[0] = -1;
1454
1455                         reset_all_signal_handlers();
1456
1457                         assert_se(sigemptyset(&mask) == 0);
1458                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1459
1460                         k = open_terminal(console, O_RDWR);
1461                         if (k != STDIN_FILENO) {
1462                                 if (k >= 0) {
1463                                         close_nointr_nofail(k);
1464                                         k = -EINVAL;
1465                                 }
1466
1467                                 log_error("Failed to open console: %s", strerror(-k));
1468                                 goto child_fail;
1469                         }
1470
1471                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1472                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1473                                 log_error("Failed to duplicate console: %m");
1474                                 goto child_fail;
1475                         }
1476
1477                         if (setsid() < 0) {
1478                                 log_error("setsid() failed: %m");
1479                                 goto child_fail;
1480                         }
1481
1482                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1483                                 log_error("PR_SET_PDEATHSIG failed: %m");
1484                                 goto child_fail;
1485                         }
1486
1487                         if (setup_cgroup(newcg) < 0)
1488                                 goto child_fail;
1489
1490                         close_pipe(pipefd2);
1491
1492                         /* Mark everything as slave, so that we still
1493                          * receive mounts from the real root, but don't
1494                          * propagate mounts to the real root. */
1495                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1496                                 log_error("MS_SLAVE|MS_REC failed: %m");
1497                                 goto child_fail;
1498                         }
1499
1500                         /* Turn directory into bind mount */
1501                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1502                                 log_error("Failed to make bind mount.");
1503                                 goto child_fail;
1504                         }
1505
1506                         if (arg_read_only)
1507                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1508                                         log_error("Failed to make read-only.");
1509                                         goto child_fail;
1510                                 }
1511
1512                         if (mount_all(arg_directory) < 0)
1513                                 goto child_fail;
1514
1515                         if (copy_devnodes(arg_directory) < 0)
1516                                 goto child_fail;
1517
1518                         if (setup_ptmx(arg_directory) < 0)
1519                                 goto child_fail;
1520
1521                         dev_setup(arg_directory);
1522
1523                         if (setup_dev_console(arg_directory, console) < 0)
1524                                 goto child_fail;
1525
1526                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1527                                 goto child_fail;
1528
1529                         close_nointr_nofail(kmsg_socket_pair[1]);
1530                         kmsg_socket_pair[1] = -1;
1531
1532                         if (setup_boot_id(arg_directory) < 0)
1533                                 goto child_fail;
1534
1535                         if (setup_timezone(arg_directory) < 0)
1536                                 goto child_fail;
1537
1538                         if (setup_resolv_conf(arg_directory) < 0)
1539                                 goto child_fail;
1540
1541                         if (setup_journal(arg_directory) < 0)
1542                                 goto child_fail;
1543
1544                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1545                                 goto child_fail;
1546
1547                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1548                                 goto child_fail;
1549
1550                         if (chdir(arg_directory) < 0) {
1551                                 log_error("chdir(%s) failed: %m", arg_directory);
1552                                 goto child_fail;
1553                         }
1554
1555                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1556                                 log_error("mount(MS_MOVE) failed: %m");
1557                                 goto child_fail;
1558                         }
1559
1560                         if (chroot(".") < 0) {
1561                                 log_error("chroot() failed: %m");
1562                                 goto child_fail;
1563                         }
1564
1565                         if (chdir("/") < 0) {
1566                                 log_error("chdir() failed: %m");
1567                                 goto child_fail;
1568                         }
1569
1570                         umask(0022);
1571
1572                         loopback_setup();
1573
1574                         if (drop_capabilities() < 0) {
1575                                 log_error("drop_capabilities() failed: %m");
1576                                 goto child_fail;
1577                         }
1578
1579                         if (arg_user) {
1580
1581                                 /* Note that this resolves user names
1582                                  * inside the container, and hence
1583                                  * accesses the NSS modules from the
1584                                  * container and not the host. This is
1585                                  * a bit weird... */
1586
1587                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1588                                         log_error("get_user_creds() failed: %m");
1589                                         goto child_fail;
1590                                 }
1591
1592                                 if (mkdir_parents_label(home, 0775) < 0) {
1593                                         log_error("mkdir_parents_label() failed: %m");
1594                                         goto child_fail;
1595                                 }
1596
1597                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1598                                         log_error("mkdir_safe_label() failed: %m");
1599                                         goto child_fail;
1600                                 }
1601
1602                                 if (initgroups((const char*)arg_user, gid) < 0) {
1603                                         log_error("initgroups() failed: %m");
1604                                         goto child_fail;
1605                                 }
1606
1607                                 if (setresgid(gid, gid, gid) < 0) {
1608                                         log_error("setregid() failed: %m");
1609                                         goto child_fail;
1610                                 }
1611
1612                                 if (setresuid(uid, uid, uid) < 0) {
1613                                         log_error("setreuid() failed: %m");
1614                                         goto child_fail;
1615                                 }
1616                         } else {
1617                                 /* Reset everything fully to 0, just in case */
1618
1619                                 if (setgroups(0, NULL) < 0) {
1620                                         log_error("setgroups() failed: %m");
1621                                         goto child_fail;
1622                                 }
1623
1624                                 if (setresgid(0, 0, 0) < 0) {
1625                                         log_error("setregid() failed: %m");
1626                                         goto child_fail;
1627                                 }
1628
1629                                 if (setresuid(0, 0, 0) < 0) {
1630                                         log_error("setreuid() failed: %m");
1631                                         goto child_fail;
1632                                 }
1633                         }
1634
1635                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1636                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1637                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1638                                 log_oom();
1639                                 goto child_fail;
1640                         }
1641
1642                         if (arg_uuid) {
1643                                 if (asprintf((char**)(envp + n_env++), "container_uuid=%s", arg_uuid) < 0) {
1644                                         log_oom();
1645                                         goto child_fail;
1646                                 }
1647                         }
1648
1649                         if (fdset_size(fds) > 0) {
1650                                 k = fdset_cloexec(fds, false);
1651                                 if (k < 0) {
1652                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1653                                         goto child_fail;
1654                                 }
1655
1656                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1657                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=%lu", (unsigned long) 1) < 0)) {
1658                                         log_oom();
1659                                         goto child_fail;
1660                                 }
1661                         }
1662
1663                         setup_hostname();
1664
1665                         if (arg_boot) {
1666                                 char **a;
1667                                 size_t l;
1668
1669                                 /* Automatically search for the init system */
1670
1671                                 l = 1 + argc - optind;
1672                                 a = newa(char*, l + 1);
1673                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1674
1675                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1676                                 execve(a[0], a, (char**) envp);
1677
1678                                 a[0] = (char*) "/lib/systemd/systemd";
1679                                 execve(a[0], a, (char**) envp);
1680
1681                                 a[0] = (char*) "/sbin/init";
1682                                 execve(a[0], a, (char**) envp);
1683                         } else if (argc > optind)
1684                                 execvpe(argv[optind], argv + optind, (char**) envp);
1685                         else {
1686                                 chdir(home ? home : "/root");
1687                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1688                         }
1689
1690                         log_error("execv() failed: %m");
1691
1692                 child_fail:
1693                         _exit(EXIT_FAILURE);
1694                 }
1695
1696                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1697                 close_nointr_nofail(pipefd[0]);
1698                 close_nointr_nofail(pipefd[1]);
1699
1700                 /* Wait for the child process to establish cgroup hierarchy */
1701                 close_nointr_nofail(pipefd2[1]);
1702                 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1703                 close_nointr_nofail(pipefd2[0]);
1704
1705                 save_attributes(newcg, pid, arg_uuid, arg_directory);
1706
1707                 fdset_free(fds);
1708                 fds = NULL;
1709
1710                 if (process_pty(master, pid, &mask) < 0)
1711                         goto finish;
1712
1713                 if (saved_attr_valid)
1714                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1715
1716                 k = wait_for_terminate(pid, &status);
1717                 if (k < 0) {
1718                         r = EXIT_FAILURE;
1719                         break;
1720                 }
1721
1722                 if (status.si_code == CLD_EXITED) {
1723                         r = status.si_status;
1724                         if (status.si_status != 0) {
1725                                 log_error("Container failed with error code %i.", status.si_status);
1726                                 break;
1727                         }
1728
1729                         log_debug("Container exited successfully.");
1730                         break;
1731                 } else if (status.si_code == CLD_KILLED &&
1732                            status.si_status == SIGINT) {
1733                         log_info("Container has been shut down.");
1734                         r = 0;
1735                         break;
1736                 } else if (status.si_code == CLD_KILLED &&
1737                            status.si_status == SIGHUP) {
1738                         log_info("Container is being rebooted.");
1739                         continue;
1740                 } else if (status.si_code == CLD_KILLED ||
1741                            status.si_code == CLD_DUMPED) {
1742
1743                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1744                         r = EXIT_FAILURE;
1745                         break;
1746                 } else {
1747                         log_error("Container failed due to unknown reason.");
1748                         r = EXIT_FAILURE;
1749                         break;
1750                 }
1751         }
1752
1753 finish:
1754         if (saved_attr_valid)
1755                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1756
1757         close_pipe(kmsg_socket_pair);
1758
1759         if (newcg)
1760                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1761
1762         free(arg_directory);
1763         free(arg_machine);
1764         strv_free(arg_controllers);
1765
1766         fdset_free(fds);
1767
1768         return r;
1769 }