chiark / gitweb /
a0809da7439cd730833137a2e0fee66a33cf3ecd
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/poll.h>
37 #include <sys/epoll.h>
38 #include <termios.h>
39 #include <sys/signalfd.h>
40 #include <grp.h>
41 #include <linux/fs.h>
42 #include <sys/un.h>
43 #include <sys/socket.h>
44 #include <linux/netlink.h>
45
46 #include <systemd/sd-daemon.h>
47 #include <systemd/sd-bus.h>
48
49 #include "log.h"
50 #include "util.h"
51 #include "mkdir.h"
52 #include "macro.h"
53 #include "audit.h"
54 #include "missing.h"
55 #include "cgroup-util.h"
56 #include "strv.h"
57 #include "path-util.h"
58 #include "loopback-setup.h"
59 #include "sd-id128.h"
60 #include "dev-setup.h"
61 #include "fdset.h"
62 #include "build.h"
63 #include "fileio.h"
64 #include "bus-internal.h"
65 #include "bus-message.h"
66
67 #ifndef TTY_GID
68 #define TTY_GID 5
69 #endif
70
/* Journal linking mode, selected with --link-journal= (or -j).
 * The numeric values are spelled out but match the implicit ones. */
typedef enum LinkJournal {
        LINK_NO    = 0,
        LINK_AUTO  = 1,
        LINK_HOST  = 2,
        LINK_GUEST = 3
} LinkJournal;
77
78 static char *arg_directory = NULL;
79 static char *arg_user = NULL;
80 static sd_id128_t arg_uuid = {};
81 static char *arg_machine = NULL;
82 static const char *arg_slice = NULL;
83 static bool arg_private_network = false;
84 static bool arg_read_only = false;
85 static bool arg_boot = false;
86 static LinkJournal arg_link_journal = LINK_AUTO;
87 static uint64_t arg_retain =
88         (1ULL << CAP_CHOWN) |
89         (1ULL << CAP_DAC_OVERRIDE) |
90         (1ULL << CAP_DAC_READ_SEARCH) |
91         (1ULL << CAP_FOWNER) |
92         (1ULL << CAP_FSETID) |
93         (1ULL << CAP_IPC_OWNER) |
94         (1ULL << CAP_KILL) |
95         (1ULL << CAP_LEASE) |
96         (1ULL << CAP_LINUX_IMMUTABLE) |
97         (1ULL << CAP_NET_BIND_SERVICE) |
98         (1ULL << CAP_NET_BROADCAST) |
99         (1ULL << CAP_NET_RAW) |
100         (1ULL << CAP_SETGID) |
101         (1ULL << CAP_SETFCAP) |
102         (1ULL << CAP_SETPCAP) |
103         (1ULL << CAP_SETUID) |
104         (1ULL << CAP_SYS_ADMIN) |
105         (1ULL << CAP_SYS_CHROOT) |
106         (1ULL << CAP_SYS_NICE) |
107         (1ULL << CAP_SYS_PTRACE) |
108         (1ULL << CAP_SYS_TTY_CONFIG) |
109         (1ULL << CAP_SYS_RESOURCE) |
110         (1ULL << CAP_SYS_BOOT) |
111         (1ULL << CAP_AUDIT_WRITE) |
112         (1ULL << CAP_AUDIT_CONTROL);
113 static char **arg_bind = NULL;
114 static char **arg_bind_ro = NULL;
115
/* Prints the usage text to stdout, prefixed with the short program name.
 * Always returns 0. */
static int help(void) {

        /* Note: -j is documented as --link-journal=guest because that is
         * the mode the option parser selects for it (LINK_GUEST). */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help                Show this help\n"
               "     --version             Print version string\n"
               "  -D --directory=NAME      Root directory for the container\n"
               "  -b --boot                Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER           Run the command under specified user or uid\n"
               "     --uuid=UUID           Set a specific machine UUID for the container\n"
               "  -M --machine=NAME        Set the machine name for the container\n"
               "  -S --slice=SLICE         Place the container in the specified slice\n"
               "     --private-network     Disable network in container\n"
               "     --read-only           Mount the root directory read-only\n"
               "     --capability=CAP      In addition to the default, retain specified\n"
               "                           capability\n"
               "     --link-journal=MODE   Link up guest journal, one of no, auto, guest, host\n"
               "  -j                       Equivalent to --link-journal=guest\n"
               "     --bind=PATH[:PATH]    Bind mount a file or directory from the host into\n"
               "                           the container\n"
               "     --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n",
               program_invocation_short_name);

        return 0;
}
141
142 static int parse_argv(int argc, char *argv[]) {
143
144         enum {
145                 ARG_VERSION = 0x100,
146                 ARG_PRIVATE_NETWORK,
147                 ARG_UUID,
148                 ARG_READ_ONLY,
149                 ARG_CAPABILITY,
150                 ARG_LINK_JOURNAL,
151                 ARG_BIND,
152                 ARG_BIND_RO
153         };
154
155         static const struct option options[] = {
156                 { "help",            no_argument,       NULL, 'h'                 },
157                 { "version",         no_argument,       NULL, ARG_VERSION         },
158                 { "directory",       required_argument, NULL, 'D'                 },
159                 { "user",            required_argument, NULL, 'u'                 },
160                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
161                 { "boot",            no_argument,       NULL, 'b'                 },
162                 { "uuid",            required_argument, NULL, ARG_UUID            },
163                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
164                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
165                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
166                 { "bind",            required_argument, NULL, ARG_BIND            },
167                 { "bind-ro",         required_argument, NULL, ARG_BIND_RO         },
168                 { "machine",         required_argument, NULL, 'M'                 },
169                 { "slice",           required_argument, NULL, 'S'                 },
170                 { NULL,              0,                 NULL, 0                   }
171         };
172
173         int c, r;
174
175         assert(argc >= 0);
176         assert(argv);
177
178         while ((c = getopt_long(argc, argv, "+hD:u:bM:jS:", options, NULL)) >= 0) {
179
180                 switch (c) {
181
182                 case 'h':
183                         help();
184                         return 0;
185
186                 case ARG_VERSION:
187                         puts(PACKAGE_STRING);
188                         puts(SYSTEMD_FEATURES);
189                         return 0;
190
191                 case 'D':
192                         free(arg_directory);
193                         arg_directory = canonicalize_file_name(optarg);
194                         if (!arg_directory) {
195                                 log_error("Failed to canonicalize root directory.");
196                                 return -ENOMEM;
197                         }
198
199                         break;
200
201                 case 'u':
202                         free(arg_user);
203                         arg_user = strdup(optarg);
204                         if (!arg_user)
205                                 return log_oom();
206
207                         break;
208
209                 case ARG_PRIVATE_NETWORK:
210                         arg_private_network = true;
211                         break;
212
213                 case 'b':
214                         arg_boot = true;
215                         break;
216
217                 case ARG_UUID:
218                         r = sd_id128_from_string(optarg, &arg_uuid);
219                         if (r < 0) {
220                                 log_error("Invalid UUID: %s", optarg);
221                                 return r;
222                         }
223                         break;
224
225                 case 'S':
226                         arg_slice = strdup(optarg);
227                         break;
228
229                 case 'M':
230                         if (!hostname_is_valid(optarg)) {
231                                 log_error("Invalid machine name: %s", optarg);
232                                 return -EINVAL;
233                         }
234
235                         free(arg_machine);
236                         arg_machine = strdup(optarg);
237                         if (!arg_machine)
238                                 return log_oom();
239
240                         break;
241
242                 case ARG_READ_ONLY:
243                         arg_read_only = true;
244                         break;
245
246                 case ARG_CAPABILITY: {
247                         char *state, *word;
248                         size_t length;
249
250                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
251                                 cap_value_t cap;
252                                 char *t;
253
254                                 t = strndup(word, length);
255                                 if (!t)
256                                         return log_oom();
257
258                                 if (cap_from_name(t, &cap) < 0) {
259                                         log_error("Failed to parse capability %s.", t);
260                                         free(t);
261                                         return -EINVAL;
262                                 }
263
264                                 free(t);
265                                 arg_retain |= 1ULL << (uint64_t) cap;
266                         }
267
268                         break;
269                 }
270
271                 case 'j':
272                         arg_link_journal = LINK_GUEST;
273                         break;
274
275                 case ARG_LINK_JOURNAL:
276                         if (streq(optarg, "auto"))
277                                 arg_link_journal = LINK_AUTO;
278                         else if (streq(optarg, "no"))
279                                 arg_link_journal = LINK_NO;
280                         else if (streq(optarg, "guest"))
281                                 arg_link_journal = LINK_GUEST;
282                         else if (streq(optarg, "host"))
283                                 arg_link_journal = LINK_HOST;
284                         else {
285                                 log_error("Failed to parse link journal mode %s", optarg);
286                                 return -EINVAL;
287                         }
288
289                         break;
290
291                 case ARG_BIND:
292                 case ARG_BIND_RO: {
293                         _cleanup_free_ char *a = NULL, *b = NULL;
294                         char *e;
295                         char ***x;
296
297                         x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
298
299                         e = strchr(optarg, ':');
300                         if (e) {
301                                 a = strndup(optarg, e - optarg);
302                                 b = strdup(e + 1);
303                         } else {
304                                 a = strdup(optarg);
305                                 b = strdup(optarg);
306                         }
307
308                         if (!a || !b)
309                                 return log_oom();
310
311                         if (!path_is_absolute(a) || !path_is_absolute(b)) {
312                                 log_error("Invalid bind mount specification: %s", optarg);
313                                 return -EINVAL;
314                         }
315
316                         r = strv_extend(x, a);
317                         if (r < 0)
318                                 return r;
319
320                         r = strv_extend(x, b);
321                         if (r < 0)
322                                 return r;
323
324                         break;
325                 }
326
327                 case '?':
328                         return -EINVAL;
329
330                 default:
331                         log_error("Unknown option code %c", c);
332                         return -EINVAL;
333                 }
334         }
335
336         return 1;
337 }
338
339 static int mount_all(const char *dest) {
340
341         typedef struct MountPoint {
342                 const char *what;
343                 const char *where;
344                 const char *type;
345                 const char *options;
346                 unsigned long flags;
347                 bool fatal;
348         } MountPoint;
349
350         static const MountPoint mount_table[] = {
351                 { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
352                 { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND, true                       },   /* Bind mount first */
353                 { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
354                 { "sysfs",     "/sys",      "sysfs", NULL,       MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
355                 { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
356                 { "devpts",    "/dev/pts",  "devpts","newinstance,ptmxmode=0666,mode=620,gid=" STRINGIFY(TTY_GID), MS_NOSUID|MS_NOEXEC, true },
357                 { "tmpfs",     "/dev/shm",  "tmpfs", "mode=1777", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
358                 { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
359 #ifdef HAVE_SELINUX
360                 { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
361                 { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
362 #endif
363         };
364
365         unsigned k;
366         int r = 0;
367
368         for (k = 0; k < ELEMENTSOF(mount_table); k++) {
369                 _cleanup_free_ char *where = NULL;
370                 int t;
371
372                 where = strjoin(dest, "/", mount_table[k].where, NULL);
373                 if (!where)
374                         return log_oom();
375
376                 t = path_is_mount_point(where, true);
377                 if (t < 0) {
378                         log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
379
380                         if (r == 0)
381                                 r = t;
382
383                         continue;
384                 }
385
386                 /* Skip this entry if it is not a remount. */
387                 if (mount_table[k].what && t > 0)
388                         continue;
389
390                 mkdir_p(where, 0755);
391
392                 if (mount(mount_table[k].what,
393                           where,
394                           mount_table[k].type,
395                           mount_table[k].flags,
396                           mount_table[k].options) < 0 &&
397                     mount_table[k].fatal) {
398
399                         log_error("mount(%s) failed: %m", where);
400
401                         if (r == 0)
402                                 r = -errno;
403                 }
404         }
405
406         return r;
407 }
408
409 static int mount_binds(const char *dest, char **l, unsigned long flags) {
410         char **x, **y;
411
412         STRV_FOREACH_PAIR(x, y, l) {
413                 _cleanup_free_ char *where = NULL;
414                 struct stat source_st, dest_st;
415
416                 if (stat(*x, &source_st) < 0) {
417                         log_error("failed to stat %s: %m", *x);
418                         return -errno;
419                 }
420
421                 where = strjoin(dest, "/", *y, NULL);
422                 if (!where)
423                         return log_oom();
424
425                 if (stat(where, &dest_st) == 0) {
426                         if ((source_st.st_mode & S_IFMT) != (dest_st.st_mode & S_IFMT)) {
427                                 log_error("The file types of %s and %s do not matching. Refusing bind mount",
428                                                 *x, where);
429                                 return -EINVAL;
430                         }
431                 } else {
432                         /* Create the mount point, but be conservative -- refuse to create block
433                          * and char devices. */
434                         if (S_ISDIR(source_st.st_mode))
435                                 mkdir_p_label(where, 0755);
436                         else if (S_ISFIFO(source_st.st_mode))
437                                 mkfifo(where, 0644);
438                         else if (S_ISSOCK(source_st.st_mode))
439                                 mknod(where, 0644 | S_IFSOCK, 0);
440                         else if (S_ISREG(source_st.st_mode))
441                                 touch(where);
442                         else {
443                                 log_error("Refusing to create mountpoint for file: %s", *x);
444                                 return -ENOTSUP;
445                         }
446                 }
447
448                 if (mount(*x, where, "bind", MS_BIND, NULL) < 0) {
449                         log_error("mount(%s) failed: %m", where);
450                         return -errno;
451                 }
452
453                 if (flags && mount(NULL, where, NULL, MS_REMOUNT|MS_BIND|flags, NULL) < 0) {
454                         log_error("mount(%s) failed: %m", where);
455                         return -errno;
456                 }
457         }
458
459         return 0;
460 }
461
462 static int setup_timezone(const char *dest) {
463         _cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
464         char *z, *y;
465         int r;
466
467         assert(dest);
468
469         /* Fix the timezone, if possible */
470         r = readlink_malloc("/etc/localtime", &p);
471         if (r < 0) {
472                 log_warning("/etc/localtime is not a symlink, not updating container timezone.");
473                 return 0;
474         }
475
476         z = path_startswith(p, "../usr/share/zoneinfo/");
477         if (!z)
478                 z = path_startswith(p, "/usr/share/zoneinfo/");
479         if (!z) {
480                 log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
481                 return 0;
482         }
483
484         where = strappend(dest, "/etc/localtime");
485         if (!where)
486                 return log_oom();
487
488         r = readlink_malloc(where, &q);
489         if (r >= 0) {
490                 y = path_startswith(q, "../usr/share/zoneinfo/");
491                 if (!y)
492                         y = path_startswith(q, "/usr/share/zoneinfo/");
493
494
495                 /* Already pointing to the right place? Then do nothing .. */
496                 if (y && streq(y, z))
497                         return 0;
498         }
499
500         check = strjoin(dest, "/usr/share/zoneinfo/", z, NULL);
501         if (!check)
502                 return log_oom();
503
504         if (access(check, F_OK) < 0) {
505                 log_warning("Timezone %s does not exist in container, not updating container timezone.", z);
506                 return 0;
507         }
508
509         what = strappend("../usr/share/zoneinfo/", z);
510         if (!what)
511                 return log_oom();
512
513         unlink(where);
514         if (symlink(what, where) < 0) {
515                 log_error("Failed to correct timezone of container: %m");
516                 return 0;
517         }
518
519         return 0;
520 }
521
522 static int setup_resolv_conf(const char *dest) {
523         char _cleanup_free_ *where = NULL;
524         _cleanup_close_ int fd = -1;
525
526         assert(dest);
527
528         if (arg_private_network)
529                 return 0;
530
531         /* Fix resolv.conf, if possible */
532         where = strappend(dest, "/etc/resolv.conf");
533         if (!where)
534                 return log_oom();
535
536         fd = open(where, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, 0644);
537
538         /* We don't really care for the results of this really. If it
539          * fails, it fails, but meh... */
540         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) < 0)
541                 log_warning("Failed to bind mount /etc/resolv.conf: %m");
542         else
543                 if (mount("/etc/resolv.conf", where, "bind",
544                           MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
545                         log_error("Failed to remount /etc/resolv.conf readonly: %m");
546                         return -errno;
547                 }
548
549         return 0;
550 }
551
552 static int setup_boot_id(const char *dest) {
553         _cleanup_free_ char *from = NULL, *to = NULL;
554         sd_id128_t rnd;
555         char as_uuid[37];
556         int r;
557
558         assert(dest);
559
560         /* Generate a new randomized boot ID, so that each boot-up of
561          * the container gets a new one */
562
563         from = strappend(dest, "/dev/proc-sys-kernel-random-boot-id");
564         to = strappend(dest, "/proc/sys/kernel/random/boot_id");
565         if (!from || !to)
566                 return log_oom();
567
568         r = sd_id128_randomize(&rnd);
569         if (r < 0) {
570                 log_error("Failed to generate random boot id: %s", strerror(-r));
571                 return r;
572         }
573
574         snprintf(as_uuid, sizeof(as_uuid),
575                  "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
576                  SD_ID128_FORMAT_VAL(rnd));
577         char_array_0(as_uuid);
578
579         r = write_string_file(from, as_uuid);
580         if (r < 0) {
581                 log_error("Failed to write boot id: %s", strerror(-r));
582                 return r;
583         }
584
585         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
586                 log_error("Failed to bind mount boot id: %m");
587                 r = -errno;
588         } else if (mount(from, to, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL))
589                 log_warning("Failed to make boot id read-only: %m");
590
591         unlink(from);
592         return r;
593 }
594
595 static int copy_devnodes(const char *dest) {
596
597         static const char devnodes[] =
598                 "null\0"
599                 "zero\0"
600                 "full\0"
601                 "random\0"
602                 "urandom\0"
603                 "tty\0";
604
605         const char *d;
606         int r = 0;
607         _cleanup_umask_ mode_t u;
608
609         assert(dest);
610
611         u = umask(0000);
612
613         NULSTR_FOREACH(d, devnodes) {
614                 struct stat st;
615                 _cleanup_free_ char *from = NULL, *to = NULL;
616
617                 asprintf(&from, "/dev/%s", d);
618                 asprintf(&to, "%s/dev/%s", dest, d);
619
620                 if (!from || !to) {
621                         log_oom();
622
623                         if (r == 0)
624                                 r = -ENOMEM;
625
626                         break;
627                 }
628
629                 if (stat(from, &st) < 0) {
630
631                         if (errno != ENOENT) {
632                                 log_error("Failed to stat %s: %m", from);
633                                 if (r == 0)
634                                         r = -errno;
635                         }
636
637                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
638
639                         log_error("%s is not a char or block device, cannot copy", from);
640                         if (r == 0)
641                                 r = -EIO;
642
643                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
644
645                         log_error("mknod(%s) failed: %m", dest);
646                         if (r == 0)
647                                 r = -errno;
648                 }
649         }
650
651         return r;
652 }
653
654 static int setup_ptmx(const char *dest) {
655         _cleanup_free_ char *p = NULL;
656
657         p = strappend(dest, "/dev/ptmx");
658         if (!p)
659                 return log_oom();
660
661         if (symlink("pts/ptmx", p) < 0) {
662                 log_error("Failed to create /dev/ptmx symlink: %m");
663                 return -errno;
664         }
665
666         return 0;
667 }
668
669 static int setup_dev_console(const char *dest, const char *console) {
670         struct stat st;
671         _cleanup_free_ char *to = NULL;
672         int r;
673         _cleanup_umask_ mode_t u;
674
675         assert(dest);
676         assert(console);
677
678         u = umask(0000);
679
680         if (stat(console, &st) < 0) {
681                 log_error("Failed to stat %s: %m", console);
682                 return -errno;
683
684         } else if (!S_ISCHR(st.st_mode)) {
685                 log_error("/dev/console is not a char device");
686                 return -EIO;
687         }
688
689         r = chmod_and_chown(console, 0600, 0, 0);
690         if (r < 0) {
691                 log_error("Failed to correct access mode for TTY: %s", strerror(-r));
692                 return r;
693         }
694
695         if (asprintf(&to, "%s/dev/console", dest) < 0)
696                 return log_oom();
697
698         /* We need to bind mount the right tty to /dev/console since
699          * ptys can only exist on pts file systems. To have something
700          * to bind mount things on we create a device node first, that
701          * has the right major/minor (note that the major minor
702          * doesn't actually matter here, since we mount it over
703          * anyway). */
704
705         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
706                 log_error("mknod() for /dev/console failed: %m");
707                 return -errno;
708         }
709
710         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
711                 log_error("Bind mount for /dev/console failed: %m");
712                 return -errno;
713         }
714
715         return 0;
716 }
717
718 static int setup_kmsg(const char *dest, int kmsg_socket) {
719         _cleanup_free_ char *from = NULL, *to = NULL;
720         int r, fd, k;
721         _cleanup_umask_ mode_t u;
722         union {
723                 struct cmsghdr cmsghdr;
724                 uint8_t buf[CMSG_SPACE(sizeof(int))];
725         } control = {};
726         struct msghdr mh = {
727                 .msg_control = &control,
728                 .msg_controllen = sizeof(control),
729         };
730         struct cmsghdr *cmsg;
731
732         assert(dest);
733         assert(kmsg_socket >= 0);
734
735         u = umask(0000);
736
737         /* We create the kmsg FIFO as /dev/kmsg, but immediately
738          * delete it after bind mounting it to /proc/kmsg. While FIFOs
739          * on the reading side behave very similar to /proc/kmsg,
740          * their writing side behaves differently from /dev/kmsg in
741          * that writing blocks when nothing is reading. In order to
742          * avoid any problems with containers deadlocking due to this
743          * we simply make /dev/kmsg unavailable to the container. */
744         if (asprintf(&from, "%s/dev/kmsg", dest) < 0 ||
745             asprintf(&to, "%s/proc/kmsg", dest) < 0)
746                 return log_oom();
747
748         if (mkfifo(from, 0600) < 0) {
749                 log_error("mkfifo() for /dev/kmsg failed: %m");
750                 return -errno;
751         }
752
753         r = chmod_and_chown(from, 0600, 0, 0);
754         if (r < 0) {
755                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
756                 return r;
757         }
758
759         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
760                 log_error("Bind mount for /proc/kmsg failed: %m");
761                 return -errno;
762         }
763
764         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
765         if (fd < 0) {
766                 log_error("Failed to open fifo: %m");
767                 return -errno;
768         }
769
770         cmsg = CMSG_FIRSTHDR(&mh);
771         cmsg->cmsg_level = SOL_SOCKET;
772         cmsg->cmsg_type = SCM_RIGHTS;
773         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
774         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
775
776         mh.msg_controllen = cmsg->cmsg_len;
777
778         /* Store away the fd in the socket, so that it stays open as
779          * long as we run the child */
780         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
781         close_nointr_nofail(fd);
782
783         if (k < 0) {
784                 log_error("Failed to send FIFO fd: %m");
785                 return -errno;
786         }
787
788         /* And now make the FIFO unavailable as /dev/kmsg... */
789         unlink(from);
790         return 0;
791 }
792
793 static int setup_hostname(void) {
794
795         if (sethostname(arg_machine, strlen(arg_machine)) < 0)
796                 return -errno;
797
798         return 0;
799 }
800
801 static int setup_journal(const char *directory) {
802         sd_id128_t machine_id;
803         _cleanup_free_ char *p = NULL, *b = NULL, *q = NULL, *d = NULL;
804         char *id;
805         int r;
806
807         if (arg_link_journal == LINK_NO)
808                 return 0;
809
810         p = strappend(directory, "/etc/machine-id");
811         if (!p)
812                 return log_oom();
813
814         r = read_one_line_file(p, &b);
815         if (r == -ENOENT && arg_link_journal == LINK_AUTO)
816                 return 0;
817         else if (r < 0) {
818                 log_error("Failed to read machine ID from %s: %s", p, strerror(-r));
819                 return r;
820         }
821
822         id = strstrip(b);
823         if (isempty(id) && arg_link_journal == LINK_AUTO)
824                 return 0;
825
826         /* Verify validity */
827         r = sd_id128_from_string(id, &machine_id);
828         if (r < 0) {
829                 log_error("Failed to parse machine ID from %s: %s", p, strerror(-r));
830                 return r;
831         }
832
833         free(p);
834         p = strappend("/var/log/journal/", id);
835         q = strjoin(directory, "/var/log/journal/", id, NULL);
836         if (!p || !q)
837                 return log_oom();
838
839         if (path_is_mount_point(p, false) > 0) {
840                 if (arg_link_journal != LINK_AUTO) {
841                         log_error("%s: already a mount point, refusing to use for journal", p);
842                         return -EEXIST;
843                 }
844
845                 return 0;
846         }
847
848         if (path_is_mount_point(q, false) > 0) {
849                 if (arg_link_journal != LINK_AUTO) {
850                         log_error("%s: already a mount point, refusing to use for journal", q);
851                         return -EEXIST;
852                 }
853
854                 return 0;
855         }
856
857         r = readlink_and_make_absolute(p, &d);
858         if (r >= 0) {
859                 if ((arg_link_journal == LINK_GUEST ||
860                      arg_link_journal == LINK_AUTO) &&
861                     path_equal(d, q)) {
862
863                         r = mkdir_p(q, 0755);
864                         if (r < 0)
865                                 log_warning("failed to create directory %s: %m", q);
866                         return 0;
867                 }
868
869                 if (unlink(p) < 0) {
870                         log_error("Failed to remove symlink %s: %m", p);
871                         return -errno;
872                 }
873         } else if (r == -EINVAL) {
874
875                 if (arg_link_journal == LINK_GUEST &&
876                     rmdir(p) < 0) {
877
878                         if (errno == ENOTDIR) {
879                                 log_error("%s already exists and is neither a symlink nor a directory", p);
880                                 return r;
881                         } else {
882                                 log_error("Failed to remove %s: %m", p);
883                                 return -errno;
884                         }
885                 }
886         } else if (r != -ENOENT) {
887                 log_error("readlink(%s) failed: %m", p);
888                 return r;
889         }
890
891         if (arg_link_journal == LINK_GUEST) {
892
893                 if (symlink(q, p) < 0) {
894                         log_error("Failed to symlink %s to %s: %m", q, p);
895                         return -errno;
896                 }
897
898                 r = mkdir_p(q, 0755);
899                 if (r < 0)
900                         log_warning("failed to create directory %s: %m", q);
901                 return 0;
902         }
903
904         if (arg_link_journal == LINK_HOST) {
905                 r = mkdir_p(p, 0755);
906                 if (r < 0) {
907                         log_error("Failed to create %s: %m", p);
908                         return r;
909                 }
910
911         } else if (access(p, F_OK) < 0)
912                 return 0;
913
914         if (dir_is_empty(q) == 0) {
915                 log_error("%s not empty.", q);
916                 return -ENOTEMPTY;
917         }
918
919         r = mkdir_p(q, 0755);
920         if (r < 0) {
921                 log_error("Failed to create %s: %m", q);
922                 return r;
923         }
924
925         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
926                 log_error("Failed to bind mount journal from host into guest: %m");
927                 return -errno;
928         }
929
930         return 0;
931 }
932
933 static int drop_capabilities(void) {
934         return capability_bounding_set_drop(~arg_retain, false);
935 }
936
937 static int process_pty(int master, pid_t pid, sigset_t *mask) {
938
939         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
940         size_t in_buffer_full = 0, out_buffer_full = 0;
941         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
942         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
943         int ep = -1, signal_fd = -1, r;
944         bool tried_orderly_shutdown = false;
945
946         assert(master >= 0);
947         assert(pid > 0);
948         assert(mask);
949
950         fd_nonblock(STDIN_FILENO, 1);
951         fd_nonblock(STDOUT_FILENO, 1);
952         fd_nonblock(master, 1);
953
954         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
955         if (signal_fd < 0) {
956                 log_error("signalfd(): %m");
957                 r = -errno;
958                 goto finish;
959         }
960
961         ep = epoll_create1(EPOLL_CLOEXEC);
962         if (ep < 0) {
963                 log_error("Failed to create epoll: %m");
964                 r = -errno;
965                 goto finish;
966         }
967
968         /* We read from STDIN only if this is actually a TTY,
969          * otherwise we assume non-interactivity. */
970         if (isatty(STDIN_FILENO)) {
971                 zero(stdin_ev);
972                 stdin_ev.events = EPOLLIN|EPOLLET;
973                 stdin_ev.data.fd = STDIN_FILENO;
974
975                 if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0) {
976                         log_error("Failed to register STDIN in epoll: %m");
977                         r = -errno;
978                         goto finish;
979                 }
980         }
981
982         zero(stdout_ev);
983         stdout_ev.events = EPOLLOUT|EPOLLET;
984         stdout_ev.data.fd = STDOUT_FILENO;
985
986         zero(master_ev);
987         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
988         master_ev.data.fd = master;
989
990         zero(signal_ev);
991         signal_ev.events = EPOLLIN;
992         signal_ev.data.fd = signal_fd;
993
994         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0) {
995                 if (errno != EPERM) {
996                         log_error("Failed to register stdout in epoll: %m");
997                         r = -errno;
998                         goto finish;
999                 }
1000                 /* stdout without epoll support. Likely redirected to regular file. */
1001                 stdout_writable = true;
1002         }
1003
1004         if (epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
1005             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
1006                 log_error("Failed to register fds in epoll: %m");
1007                 r = -errno;
1008                 goto finish;
1009         }
1010
1011         for (;;) {
1012                 struct epoll_event ev[16];
1013                 ssize_t k;
1014                 int i, nfds;
1015
1016                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
1017                 if (nfds < 0) {
1018
1019                         if (errno == EINTR || errno == EAGAIN)
1020                                 continue;
1021
1022                         log_error("epoll_wait(): %m");
1023                         r = -errno;
1024                         goto finish;
1025                 }
1026
1027                 assert(nfds >= 1);
1028
1029                 for (i = 0; i < nfds; i++) {
1030                         if (ev[i].data.fd == STDIN_FILENO) {
1031
1032                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1033                                         stdin_readable = true;
1034
1035                         } else if (ev[i].data.fd == STDOUT_FILENO) {
1036
1037                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1038                                         stdout_writable = true;
1039
1040                         } else if (ev[i].data.fd == master) {
1041
1042                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
1043                                         master_readable = true;
1044
1045                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
1046                                         master_writable = true;
1047
1048                         } else if (ev[i].data.fd == signal_fd) {
1049                                 struct signalfd_siginfo sfsi;
1050                                 ssize_t n;
1051
1052                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
1053                                 if (n != sizeof(sfsi)) {
1054
1055                                         if (n >= 0) {
1056                                                 log_error("Failed to read from signalfd: invalid block size");
1057                                                 r = -EIO;
1058                                                 goto finish;
1059                                         }
1060
1061                                         if (errno != EINTR && errno != EAGAIN) {
1062                                                 log_error("Failed to read from signalfd: %m");
1063                                                 r = -errno;
1064                                                 goto finish;
1065                                         }
1066                                 } else {
1067
1068                                         if (sfsi.ssi_signo == SIGWINCH) {
1069                                                 struct winsize ws;
1070
1071                                                 /* The window size changed, let's forward that. */
1072                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1073                                                         ioctl(master, TIOCSWINSZ, &ws);
1074                                         } else if (sfsi.ssi_signo == SIGTERM && arg_boot && !tried_orderly_shutdown) {
1075
1076                                                 log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
1077
1078                                                 /* This only works for systemd... */
1079                                                 tried_orderly_shutdown = true;
1080                                                 kill(pid, SIGRTMIN+3);
1081
1082                                         } else {
1083                                                 r = 0;
1084                                                 goto finish;
1085                                         }
1086                                 }
1087                         }
1088                 }
1089
1090                 while ((stdin_readable && in_buffer_full <= 0) ||
1091                        (master_writable && in_buffer_full > 0) ||
1092                        (master_readable && out_buffer_full <= 0) ||
1093                        (stdout_writable && out_buffer_full > 0)) {
1094
1095                         if (stdin_readable && in_buffer_full < LINE_MAX) {
1096
1097                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
1098                                 if (k < 0) {
1099
1100                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1101                                                 stdin_readable = false;
1102                                         else {
1103                                                 log_error("read(): %m");
1104                                                 r = -errno;
1105                                                 goto finish;
1106                                         }
1107                                 } else
1108                                         in_buffer_full += (size_t) k;
1109                         }
1110
1111                         if (master_writable && in_buffer_full > 0) {
1112
1113                                 k = write(master, in_buffer, in_buffer_full);
1114                                 if (k < 0) {
1115
1116                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1117                                                 master_writable = false;
1118                                         else {
1119                                                 log_error("write(): %m");
1120                                                 r = -errno;
1121                                                 goto finish;
1122                                         }
1123
1124                                 } else {
1125                                         assert(in_buffer_full >= (size_t) k);
1126                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
1127                                         in_buffer_full -= k;
1128                                 }
1129                         }
1130
1131                         if (master_readable && out_buffer_full < LINE_MAX) {
1132
1133                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
1134                                 if (k < 0) {
1135
1136                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1137                                                 master_readable = false;
1138                                         else {
1139                                                 log_error("read(): %m");
1140                                                 r = -errno;
1141                                                 goto finish;
1142                                         }
1143                                 }  else
1144                                         out_buffer_full += (size_t) k;
1145                         }
1146
1147                         if (stdout_writable && out_buffer_full > 0) {
1148
1149                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
1150                                 if (k < 0) {
1151
1152                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
1153                                                 stdout_writable = false;
1154                                         else {
1155                                                 log_error("write(): %m");
1156                                                 r = -errno;
1157                                                 goto finish;
1158                                         }
1159
1160                                 } else {
1161                                         assert(out_buffer_full >= (size_t) k);
1162                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
1163                                         out_buffer_full -= k;
1164                                 }
1165                         }
1166                 }
1167         }
1168
1169 finish:
1170         if (ep >= 0)
1171                 close_nointr_nofail(ep);
1172
1173         if (signal_fd >= 0)
1174                 close_nointr_nofail(signal_fd);
1175
1176         return r;
1177 }
1178
1179 static int register_machine(void) {
1180         _cleanup_bus_error_free_ sd_bus_error error = SD_BUS_ERROR_NULL;
1181         _cleanup_bus_unref_ sd_bus *bus = NULL;
1182         int r;
1183
1184         r = sd_bus_open_system(&bus);
1185         if (r < 0) {
1186                 log_error("Failed to open system bus: %s", strerror(-r));
1187                 return r;
1188         }
1189
1190         r = sd_bus_call_method(
1191                         bus,
1192                         "org.freedesktop.machine1",
1193                         "/org/freedesktop/machine1",
1194                         "org.freedesktop.machine1.Manager",
1195                         "CreateMachine",
1196                         &error,
1197                         NULL,
1198                         "sayssusa(sv)",
1199                         arg_machine,
1200                         SD_BUS_APPEND_ID128(arg_uuid),
1201                         "nspawn",
1202                         "container",
1203                         (uint32_t) 0,
1204                         strempty(arg_directory),
1205                         1, "Slice", "s", strempty(arg_slice));
1206         if (r < 0) {
1207                 log_error("Failed to register machine: %s", error.message ? error.message : strerror(-r));
1208                 return r;
1209         }
1210
1211         return 0;
1212 }
1213
/* Probes for the audit subsystem by trying to open a NETLINK_AUDIT socket.
 *
 * Returns true if the socket could be created (it is closed again right
 * away), false otherwise. */
static bool audit_enabled(void) {
        int probe = socket(AF_NETLINK, SOCK_RAW, NETLINK_AUDIT);

        if (probe < 0)
                return false;

        close_nointr_nofail(probe);
        return true;
}
1224
1225 int main(int argc, char *argv[]) {
1226         pid_t pid = 0;
1227         int r = EXIT_FAILURE, k;
1228         _cleanup_close_ int master = -1;
1229         int n_fd_passed;
1230         const char *console = NULL;
1231         struct termios saved_attr, raw_attr;
1232         sigset_t mask;
1233         bool saved_attr_valid = false;
1234         struct winsize ws;
1235         int kmsg_socket_pair[2] = { -1, -1 };
1236         FDSet *fds = NULL;
1237
1238         log_parse_environment();
1239         log_open();
1240
1241         k = parse_argv(argc, argv);
1242         if (k < 0)
1243                 goto finish;
1244         else if (k == 0) {
1245                 r = EXIT_SUCCESS;
1246                 goto finish;
1247         }
1248
1249         if (arg_directory) {
1250                 char *p;
1251
1252                 p = path_make_absolute_cwd(arg_directory);
1253                 free(arg_directory);
1254                 arg_directory = p;
1255         } else
1256                 arg_directory = get_current_dir_name();
1257
1258         if (!arg_directory) {
1259                 log_error("Failed to determine path, please use -D.");
1260                 goto finish;
1261         }
1262
1263         path_kill_slashes(arg_directory);
1264
1265         if (!arg_machine) {
1266                 arg_machine = strdup(path_get_file_name(arg_directory));
1267                 if (!arg_machine) {
1268                         log_oom();
1269                         goto finish;
1270                 }
1271
1272                 hostname_cleanup(arg_machine, false);
1273                 if (isempty(arg_machine)) {
1274                         log_error("Failed to determine machine name automatically, please use -M.");
1275                         goto finish;
1276                 }
1277         }
1278
1279         if (geteuid() != 0) {
1280                 log_error("Need to be root.");
1281                 goto finish;
1282         }
1283
1284         if (sd_booted() <= 0) {
1285                 log_error("Not running on a systemd system.");
1286                 goto finish;
1287         }
1288
1289         if (arg_boot && audit_enabled()) {
1290                 log_warning("The kernel auditing subsystem is known to be incompatible with containers.\n"
1291                             "Please make sure to turn off auditing with 'audit=0' on the kernel command\n"
1292                             "line before using systemd-nspawn. Sleeping for 5s...\n");
1293                 sleep(5);
1294         }
1295
1296         if (path_equal(arg_directory, "/")) {
1297                 log_error("Spawning container on root directory not supported.");
1298                 goto finish;
1299         }
1300
1301         if (path_is_os_tree(arg_directory) <= 0) {
1302                 log_error("Directory %s doesn't look like an OS root directory (/etc/os-release is missing). Refusing.", arg_directory);
1303                 goto finish;
1304         }
1305
1306         log_close();
1307         n_fd_passed = sd_listen_fds(false);
1308         if (n_fd_passed > 0) {
1309                 k = fdset_new_listen_fds(&fds, false);
1310                 if (k < 0) {
1311                         log_error("Failed to collect file descriptors: %s", strerror(-k));
1312                         goto finish;
1313                 }
1314         }
1315         fdset_close_others(fds);
1316         log_open();
1317
1318         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1319         if (master < 0) {
1320                 log_error("Failed to acquire pseudo tty: %m");
1321                 goto finish;
1322         }
1323
1324         console = ptsname(master);
1325         if (!console) {
1326                 log_error("Failed to determine tty name: %m");
1327                 goto finish;
1328         }
1329
1330         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1331
1332         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1333                 ioctl(master, TIOCSWINSZ, &ws);
1334
1335         if (unlockpt(master) < 0) {
1336                 log_error("Failed to unlock tty: %m");
1337                 goto finish;
1338         }
1339
1340         if (tcgetattr(STDIN_FILENO, &saved_attr) >= 0) {
1341                 saved_attr_valid = true;
1342
1343                 raw_attr = saved_attr;
1344                 cfmakeraw(&raw_attr);
1345                 raw_attr.c_lflag &= ~ECHO;
1346         }
1347
1348         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1349                 log_error("Failed to create kmsg socket pair.");
1350                 goto finish;
1351         }
1352
1353         sd_notify(0, "READY=1");
1354
1355         assert_se(sigemptyset(&mask) == 0);
1356         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1357         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1358
1359         for (;;) {
1360                 siginfo_t status;
1361                 int pipefd[2], pipefd2[2];
1362
1363                 if (pipe2(pipefd, O_NONBLOCK|O_CLOEXEC) < 0) {
1364                         log_error("pipe2(): %m");
1365                         goto finish;
1366                 }
1367
1368                 if (pipe2(pipefd2, O_NONBLOCK|O_CLOEXEC) < 0) {
1369                         log_error("pipe2(): %m");
1370                         close_pipe(pipefd);
1371                         goto finish;
1372                 }
1373
1374                 pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1375                 if (pid < 0) {
1376                         if (errno == EINVAL)
1377                                 log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1378                         else
1379                                 log_error("clone() failed: %m");
1380
1381                         goto finish;
1382                 }
1383
1384                 if (pid == 0) {
1385                         /* child */
1386                         const char *home = NULL;
1387                         uid_t uid = (uid_t) -1;
1388                         gid_t gid = (gid_t) -1;
1389                         unsigned n_env = 2;
1390                         const char *envp[] = {
1391                                 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1392                                 "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1393                                 NULL, /* TERM */
1394                                 NULL, /* HOME */
1395                                 NULL, /* USER */
1396                                 NULL, /* LOGNAME */
1397                                 NULL, /* container_uuid */
1398                                 NULL, /* LISTEN_FDS */
1399                                 NULL, /* LISTEN_PID */
1400                                 NULL
1401                         };
1402
1403                         envp[n_env] = strv_find_prefix(environ, "TERM=");
1404                         if (envp[n_env])
1405                                 n_env ++;
1406
1407                         /* Wait for the parent process to log our PID */
1408                         close_nointr_nofail(pipefd[1]);
1409                         fd_wait_for_event(pipefd[0], POLLHUP, -1);
1410                         close_nointr_nofail(pipefd[0]);
1411
1412                         close_nointr_nofail(master);
1413                         master = -1;
1414
1415                         if (saved_attr_valid) {
1416                                 if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1417                                         log_error("Failed to set terminal attributes: %m");
1418                                         goto child_fail;
1419                                 }
1420                         }
1421
1422                         close_nointr(STDIN_FILENO);
1423                         close_nointr(STDOUT_FILENO);
1424                         close_nointr(STDERR_FILENO);
1425
1426                         close_nointr_nofail(kmsg_socket_pair[0]);
1427                         kmsg_socket_pair[0] = -1;
1428
1429                         reset_all_signal_handlers();
1430
1431                         assert_se(sigemptyset(&mask) == 0);
1432                         assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1433
1434                         k = open_terminal(console, O_RDWR);
1435                         if (k != STDIN_FILENO) {
1436                                 if (k >= 0) {
1437                                         close_nointr_nofail(k);
1438                                         k = -EINVAL;
1439                                 }
1440
1441                                 log_error("Failed to open console: %s", strerror(-k));
1442                                 goto child_fail;
1443                         }
1444
1445                         if (dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1446                             dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO) {
1447                                 log_error("Failed to duplicate console: %m");
1448                                 goto child_fail;
1449                         }
1450
1451                         if (setsid() < 0) {
1452                                 log_error("setsid() failed: %m");
1453                                 goto child_fail;
1454                         }
1455
1456                         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1457                                 log_error("PR_SET_PDEATHSIG failed: %m");
1458                                 goto child_fail;
1459                         }
1460
1461                         close_pipe(pipefd2);
1462
1463                         r = register_machine();
1464                         if (r < 0)
1465                                 goto finish;
1466
1467                         /* Mark everything as slave, so that we still
1468                          * receive mounts from the real root, but don't
1469                          * propagate mounts to the real root. */
1470                         if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1471                                 log_error("MS_SLAVE|MS_REC failed: %m");
1472                                 goto child_fail;
1473                         }
1474
1475                         /* Turn directory into bind mount */
1476                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1477                                 log_error("Failed to make bind mount.");
1478                                 goto child_fail;
1479                         }
1480
1481                         if (arg_read_only)
1482                                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1483                                         log_error("Failed to make read-only.");
1484                                         goto child_fail;
1485                                 }
1486
1487                         if (mount_all(arg_directory) < 0)
1488                                 goto child_fail;
1489
1490                         if (copy_devnodes(arg_directory) < 0)
1491                                 goto child_fail;
1492
1493                         if (setup_ptmx(arg_directory) < 0)
1494                                 goto child_fail;
1495
1496                         dev_setup(arg_directory);
1497
1498                         if (setup_dev_console(arg_directory, console) < 0)
1499                                 goto child_fail;
1500
1501                         if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1502                                 goto child_fail;
1503
1504                         close_nointr_nofail(kmsg_socket_pair[1]);
1505                         kmsg_socket_pair[1] = -1;
1506
1507                         if (setup_boot_id(arg_directory) < 0)
1508                                 goto child_fail;
1509
1510                         if (setup_timezone(arg_directory) < 0)
1511                                 goto child_fail;
1512
1513                         if (setup_resolv_conf(arg_directory) < 0)
1514                                 goto child_fail;
1515
1516                         if (setup_journal(arg_directory) < 0)
1517                                 goto child_fail;
1518
1519                         if (mount_binds(arg_directory, arg_bind, 0) < 0)
1520                                 goto child_fail;
1521
1522                         if (mount_binds(arg_directory, arg_bind_ro, MS_RDONLY) < 0)
1523                                 goto child_fail;
1524
1525                         if (chdir(arg_directory) < 0) {
1526                                 log_error("chdir(%s) failed: %m", arg_directory);
1527                                 goto child_fail;
1528                         }
1529
1530                         if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1531                                 log_error("mount(MS_MOVE) failed: %m");
1532                                 goto child_fail;
1533                         }
1534
1535                         if (chroot(".") < 0) {
1536                                 log_error("chroot() failed: %m");
1537                                 goto child_fail;
1538                         }
1539
1540                         if (chdir("/") < 0) {
1541                                 log_error("chdir() failed: %m");
1542                                 goto child_fail;
1543                         }
1544
1545                         umask(0022);
1546
1547                         loopback_setup();
1548
1549                         if (drop_capabilities() < 0) {
1550                                 log_error("drop_capabilities() failed: %m");
1551                                 goto child_fail;
1552                         }
1553
1554                         if (arg_user) {
1555
1556                                 /* Note that this resolves user names
1557                                  * inside the container, and hence
1558                                  * accesses the NSS modules from the
1559                                  * container and not the host. This is
1560                                  * a bit weird... */
1561
1562                                 if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1563                                         log_error("get_user_creds() failed: %m");
1564                                         goto child_fail;
1565                                 }
1566
1567                                 if (mkdir_parents_label(home, 0775) < 0) {
1568                                         log_error("mkdir_parents_label() failed: %m");
1569                                         goto child_fail;
1570                                 }
1571
1572                                 if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1573                                         log_error("mkdir_safe_label() failed: %m");
1574                                         goto child_fail;
1575                                 }
1576
1577                                 if (initgroups((const char*)arg_user, gid) < 0) {
1578                                         log_error("initgroups() failed: %m");
1579                                         goto child_fail;
1580                                 }
1581
1582                                 if (setresgid(gid, gid, gid) < 0) {
1583                                         log_error("setregid() failed: %m");
1584                                         goto child_fail;
1585                                 }
1586
1587                                 if (setresuid(uid, uid, uid) < 0) {
1588                                         log_error("setreuid() failed: %m");
1589                                         goto child_fail;
1590                                 }
1591                         } else {
1592                                 /* Reset everything fully to 0, just in case */
1593
1594                                 if (setgroups(0, NULL) < 0) {
1595                                         log_error("setgroups() failed: %m");
1596                                         goto child_fail;
1597                                 }
1598
1599                                 if (setresgid(0, 0, 0) < 0) {
1600                                         log_error("setregid() failed: %m");
1601                                         goto child_fail;
1602                                 }
1603
1604                                 if (setresuid(0, 0, 0) < 0) {
1605                                         log_error("setreuid() failed: %m");
1606                                         goto child_fail;
1607                                 }
1608                         }
1609
1610                         if ((asprintf((char**)(envp + n_env++), "HOME=%s", home ? home: "/root") < 0) ||
1611                             (asprintf((char**)(envp + n_env++), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1612                             (asprintf((char**)(envp + n_env++), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1613                                 log_oom();
1614                                 goto child_fail;
1615                         }
1616
1617                         if (!sd_id128_equal(arg_uuid, SD_ID128_NULL)) {
1618                                 if (asprintf((char**)(envp + n_env++), "container_uuid=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid)) < 0) {
1619                                         log_oom();
1620                                         goto child_fail;
1621                                 }
1622                         }
1623
1624                         if (fdset_size(fds) > 0) {
1625                                 k = fdset_cloexec(fds, false);
1626                                 if (k < 0) {
1627                                         log_error("Failed to unset O_CLOEXEC for file descriptors.");
1628                                         goto child_fail;
1629                                 }
1630
1631                                 if ((asprintf((char **)(envp + n_env++), "LISTEN_FDS=%u", n_fd_passed) < 0) ||
1632                                     (asprintf((char **)(envp + n_env++), "LISTEN_PID=1") < 0)) {
1633                                         log_oom();
1634                                         goto child_fail;
1635                                 }
1636                         }
1637
1638                         setup_hostname();
1639
1640                         if (arg_boot) {
1641                                 char **a;
1642                                 size_t l;
1643
1644                                 /* Automatically search for the init system */
1645
1646                                 l = 1 + argc - optind;
1647                                 a = newa(char*, l + 1);
1648                                 memcpy(a + 1, argv + optind, l * sizeof(char*));
1649
1650                                 a[0] = (char*) "/usr/lib/systemd/systemd";
1651                                 execve(a[0], a, (char**) envp);
1652
1653                                 a[0] = (char*) "/lib/systemd/systemd";
1654                                 execve(a[0], a, (char**) envp);
1655
1656                                 a[0] = (char*) "/sbin/init";
1657                                 execve(a[0], a, (char**) envp);
1658                         } else if (argc > optind)
1659                                 execvpe(argv[optind], argv + optind, (char**) envp);
1660                         else {
1661                                 chdir(home ? home : "/root");
1662                                 execle("/bin/bash", "-bash", NULL, (char**) envp);
1663                         }
1664
1665                         log_error("execv() failed: %m");
1666
1667                 child_fail:
1668                         _exit(EXIT_FAILURE);
1669                 }
1670
1671                 log_info("Init process in the container running as PID %lu.", (unsigned long) pid);
1672                 close_nointr_nofail(pipefd[0]);
1673                 close_nointr_nofail(pipefd[1]);
1674
1675                 /* Wait for the child process to establish cgroup hierarchy */
1676                 close_nointr_nofail(pipefd2[1]);
1677                 fd_wait_for_event(pipefd2[0], POLLHUP, -1);
1678                 close_nointr_nofail(pipefd2[0]);
1679
1680                 fdset_free(fds);
1681                 fds = NULL;
1682
1683                 if (process_pty(master, pid, &mask) < 0)
1684                         goto finish;
1685
1686                 if (saved_attr_valid)
1687                         tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1688
1689                 k = wait_for_terminate(pid, &status);
1690                 if (k < 0) {
1691                         r = EXIT_FAILURE;
1692                         break;
1693                 }
1694
1695                 if (status.si_code == CLD_EXITED) {
1696                         r = status.si_status;
1697                         if (status.si_status != 0) {
1698                                 log_error("Container failed with error code %i.", status.si_status);
1699                                 break;
1700                         }
1701
1702                         log_debug("Container exited successfully.");
1703                         break;
1704                 } else if (status.si_code == CLD_KILLED &&
1705                            status.si_status == SIGINT) {
1706                         log_info("Container has been shut down.");
1707                         r = 0;
1708                         break;
1709                 } else if (status.si_code == CLD_KILLED &&
1710                            status.si_status == SIGHUP) {
1711                         log_info("Container is being rebooted.");
1712                         continue;
1713                 } else if (status.si_code == CLD_KILLED ||
1714                            status.si_code == CLD_DUMPED) {
1715
1716                         log_error("Container terminated by signal %s.", signal_to_string(status.si_status));
1717                         r = EXIT_FAILURE;
1718                         break;
1719                 } else {
1720                         log_error("Container failed due to unknown reason.");
1721                         r = EXIT_FAILURE;
1722                         break;
1723                 }
1724         }
1725
1726 finish:
1727         if (saved_attr_valid)
1728                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1729
1730         close_pipe(kmsg_socket_pair);
1731
1732         if (pid > 0)
1733                 kill(pid, SIGKILL);
1734
1735         free(arg_directory);
1736         free(arg_machine);
1737
1738         fdset_free(fds);
1739
1740         return r;
1741 }