chiark / gitweb /
nspawn: add /dev FD symlinks in container setup
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55 #include "sd-id128.h"
56 #include "dev-setup.h"
57
/* Selects how the journal directory of the container is linked with the
 * host; chosen via --link-journal= or -j in parse_argv() and acted upon in
 * setup_journal(). */
typedef enum LinkJournal {
        LINK_NO = 0,    /* --link-journal=no: setup_journal() does nothing */
        LINK_AUTO,      /* --link-journal=auto: the built-in default */
        LINK_HOST,      /* --link-journal=host */
        LINK_GUEST      /* --link-journal=guest, also selected by -j */
} LinkJournal;
64
65 static char *arg_directory = NULL;
66 static char *arg_user = NULL;
67 static char **arg_controllers = NULL;
68 static char *arg_uuid = NULL;
69 static bool arg_private_network = false;
70 static bool arg_read_only = false;
71 static bool arg_boot = false;
72 static LinkJournal arg_link_journal = LINK_AUTO;
73 static uint64_t arg_retain =
74         (1ULL << CAP_CHOWN) |
75         (1ULL << CAP_DAC_OVERRIDE) |
76         (1ULL << CAP_DAC_READ_SEARCH) |
77         (1ULL << CAP_FOWNER) |
78         (1ULL << CAP_FSETID) |
79         (1ULL << CAP_IPC_OWNER) |
80         (1ULL << CAP_KILL) |
81         (1ULL << CAP_LEASE) |
82         (1ULL << CAP_LINUX_IMMUTABLE) |
83         (1ULL << CAP_NET_BIND_SERVICE) |
84         (1ULL << CAP_NET_BROADCAST) |
85         (1ULL << CAP_NET_RAW) |
86         (1ULL << CAP_SETGID) |
87         (1ULL << CAP_SETFCAP) |
88         (1ULL << CAP_SETPCAP) |
89         (1ULL << CAP_SETUID) |
90         (1ULL << CAP_SYS_ADMIN) |
91         (1ULL << CAP_SYS_CHROOT) |
92         (1ULL << CAP_SYS_NICE) |
93         (1ULL << CAP_SYS_PTRACE) |
94         (1ULL << CAP_SYS_TTY_CONFIG) |
95         (1ULL << CAP_SYS_RESOURCE);
96
/* Prints the usage summary for the command line to stdout.
 *
 * The text for -j says "guest" because that is what parse_argv() actually
 * selects for -j (LINK_GUEST).
 *
 * Always returns 0. */
static int help(void) {

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
               "Spawn a minimal namespace container for debugging, testing and building.\n\n"
               "  -h --help               Show this help\n"
               "  -D --directory=NAME     Root directory for the container\n"
               "  -b --boot               Boot up full system (i.e. invoke init)\n"
               "  -u --user=USER          Run the command under specified user or uid\n"
               "  -C --controllers=LIST   Put the container in specified comma-separated cgroup hierarchies\n"
               "     --uuid=UUID          Set a specific machine UUID for the container\n"
               "     --private-network    Disable network in container\n"
               "     --read-only          Mount the root directory read-only\n"
               "     --capability=CAP     In addition to the default, retain specified capability\n"
               "     --link-journal=MODE  Link up guest journal, one of no, auto, guest, host\n"
               "  -j                      Equivalent to --link-journal=guest\n",
               program_invocation_short_name);

        return 0;
}
116
117 static int parse_argv(int argc, char *argv[]) {
118
119         enum {
120                 ARG_PRIVATE_NETWORK = 0x100,
121                 ARG_UUID,
122                 ARG_READ_ONLY,
123                 ARG_CAPABILITY,
124                 ARG_LINK_JOURNAL
125         };
126
127         static const struct option options[] = {
128                 { "help",            no_argument,       NULL, 'h'                 },
129                 { "directory",       required_argument, NULL, 'D'                 },
130                 { "user",            required_argument, NULL, 'u'                 },
131                 { "controllers",     required_argument, NULL, 'C'                 },
132                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
133                 { "boot",            no_argument,       NULL, 'b'                 },
134                 { "uuid",            required_argument, NULL, ARG_UUID            },
135                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
136                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
137                 { "link-journal",    required_argument, NULL, ARG_LINK_JOURNAL    },
138                 { NULL,              0,                 NULL, 0                   }
139         };
140
141         int c;
142
143         assert(argc >= 0);
144         assert(argv);
145
146         while ((c = getopt_long(argc, argv, "+hD:u:C:bj", options, NULL)) >= 0) {
147
148                 switch (c) {
149
150                 case 'h':
151                         help();
152                         return 0;
153
154                 case 'D':
155                         free(arg_directory);
156                         arg_directory = canonicalize_file_name(optarg);
157                         if (!arg_directory) {
158                                 log_error("Failed to canonicalize root directory.");
159                                 return -ENOMEM;
160                         }
161
162                         break;
163
164                 case 'u':
165                         free(arg_user);
166                         if (!(arg_user = strdup(optarg))) {
167                                 log_error("Failed to duplicate user name.");
168                                 return -ENOMEM;
169                         }
170
171                         break;
172
173                 case 'C':
174                         strv_free(arg_controllers);
175                         arg_controllers = strv_split(optarg, ",");
176                         if (!arg_controllers) {
177                                 log_error("Failed to split controllers list.");
178                                 return -ENOMEM;
179                         }
180                         strv_uniq(arg_controllers);
181
182                         break;
183
184                 case ARG_PRIVATE_NETWORK:
185                         arg_private_network = true;
186                         break;
187
188                 case 'b':
189                         arg_boot = true;
190                         break;
191
192                 case ARG_UUID:
193                         arg_uuid = optarg;
194                         break;
195
196                 case ARG_READ_ONLY:
197                         arg_read_only = true;
198                         break;
199
200                 case ARG_CAPABILITY: {
201                         char *state, *word;
202                         size_t length;
203
204                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
205                                 cap_value_t cap;
206                                 char *t;
207
208                                 t = strndup(word, length);
209                                 if (!t)
210                                         return log_oom();
211
212                                 if (cap_from_name(t, &cap) < 0) {
213                                         log_error("Failed to parse capability %s.", t);
214                                         free(t);
215                                         return -EINVAL;
216                                 }
217
218                                 free(t);
219                                 arg_retain |= 1ULL << (uint64_t) cap;
220                         }
221
222                         break;
223                 }
224
225                 case 'j':
226                         arg_link_journal = LINK_GUEST;
227                         break;
228
229                 case ARG_LINK_JOURNAL:
230                         if (streq(optarg, "auto"))
231                                 arg_link_journal = LINK_AUTO;
232                         else if (streq(optarg, "no"))
233                                 arg_link_journal = LINK_NO;
234                         else if (streq(optarg, "guest"))
235                                 arg_link_journal = LINK_GUEST;
236                         else if (streq(optarg, "host"))
237                                 arg_link_journal = LINK_HOST;
238                         else {
239                                 log_error("Failed to parse link journal mode %s", optarg);
240                                 return -EINVAL;
241                         }
242
243                         break;
244
245                 case '?':
246                         return -EINVAL;
247
248                 default:
249                         log_error("Unknown option code %c", c);
250                         return -EINVAL;
251                 }
252         }
253
254         return 1;
255 }
256
/* Establishes the API file systems of the container below 'dest', in the
 * order given by the table below.
 *
 * Processing continues after a failed entry; the first error seen is what
 * gets returned (0 if there was none). Failing mounts of entries that are
 * not marked fatal are ignored silently. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", NULL,    NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { NULL,        "/proc/sys", NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      NULL,    NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { NULL,        "/sys",      NULL,    NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  NULL,    NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", NULL, NULL, MS_BIND,                      false },  /* Bind mount first */
                { NULL,              "/sys/fs/selinux", NULL, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        int result = 0;
        unsigned idx;

        for (idx = 0; idx < ELEMENTSOF(mount_table); idx++) {
                const MountPoint *entry = mount_table + idx;
                char *target;
                int probe;

                if (asprintf(&target, "%s/%s", dest, entry->where) < 0) {
                        /* Out of memory: give up on the remaining entries */
                        log_oom();

                        if (result == 0)
                                result = -ENOMEM;

                        break;
                }

                probe = path_is_mount_point(target, false);
                if (probe < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-probe));

                        if (result == 0)
                                result = probe;

                } else {
                        /* Best effort, the mount() below notices a missing directory */
                        mkdir_p_label(target, 0755);

                        if (mount(entry->what,
                                  target,
                                  entry->type,
                                  entry->flags,
                                  entry->options) < 0 &&
                            entry->fatal) {

                                log_error("mount(%s) failed: %m", target);

                                if (result == 0)
                                        result = -errno;
                        }
                }

                free(target);
        }

        return result;
}
330
/* Makes the time zone configuration of the host visible in the container by
 * bind mounting /etc/localtime and /etc/timezone read-only over the files of
 * the same name below 'dest'.
 *
 * Mount failures are ignored on purpose ("if possible"); only memory
 * exhaustion is reported, via the return value of log_oom(). Otherwise
 * returns 0. */
static int setup_timezone(const char *dest) {
        char *localtime_path, *timezone_path;

        assert(dest);

        if (asprintf(&localtime_path, "%s/etc/localtime", dest) < 0)
                return log_oom();

        /* Bind first, then remount the bind mount read-only */
        if (mount("/etc/localtime", localtime_path, "bind", MS_BIND, NULL) >= 0)
                mount("/etc/localtime", localtime_path, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

        free(localtime_path);

        if (asprintf(&timezone_path, "%s/etc/timezone", dest) < 0)
                return log_oom();

        /* Same two-step procedure for the second file */
        if (mount("/etc/timezone", timezone_path, "bind", MS_BIND, NULL) >= 0)
                mount("/etc/timezone", timezone_path, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

        free(timezone_path);

        return 0;
}
355
356 static int setup_resolv_conf(const char *dest) {
357         char *where;
358
359         assert(dest);
360
361         if (arg_private_network)
362                 return 0;
363
364         /* Fix resolv.conf, if possible */
365         if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
366                 return log_oom();
367         }
368
369         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
370                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
371
372         free(where);
373
374         return 0;
375 }
376
377 static int copy_devnodes(const char *dest) {
378
379         static const char devnodes[] =
380                 "null\0"
381                 "zero\0"
382                 "full\0"
383                 "random\0"
384                 "urandom\0"
385                 "tty\0"
386                 "ptmx\0"
387                 "rtc0\0";
388
389         const char *d;
390         int r = 0;
391         mode_t u;
392
393         assert(dest);
394
395         u = umask(0000);
396
397         NULSTR_FOREACH(d, devnodes) {
398                 struct stat st;
399                 char *from = NULL, *to = NULL;
400
401                 asprintf(&from, "/dev/%s", d);
402                 asprintf(&to, "%s/dev/%s", dest, d);
403
404                 if (!from || !to) {
405                         log_error("Failed to allocate devnode path");
406
407                         free(from);
408                         free(to);
409
410                         from = to = NULL;
411
412                         if (r == 0)
413                                 r = -ENOMEM;
414
415                         break;
416                 }
417
418                 if (stat(from, &st) < 0) {
419
420                         if (errno != ENOENT) {
421                                 log_error("Failed to stat %s: %m", from);
422                                 if (r == 0)
423                                         r = -errno;
424                         }
425
426                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
427
428                         log_error("%s is not a char or block device, cannot copy.", from);
429                         if (r == 0)
430                                 r = -EIO;
431
432                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
433
434                         log_error("mknod(%s) failed: %m", dest);
435                         if (r == 0)
436                                 r = -errno;
437                 }
438
439                 free(from);
440                 free(to);
441         }
442
443         umask(u);
444
445         return r;
446 }
447
/* Provides <dest>/dev/console by bind mounting the TTY 'console' onto it.
 *
 * 'console' has to be a character device; it is changed to mode 0600, owned
 * by uid/gid 0, first. The umask is cleared for the duration of the call and
 * restored before returning.
 *
 * Returns a negative errno-style code on failure, otherwise the
 * (non-negative) result of chmod_and_chown(). */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat console_st;
        char *target = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &console_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(console_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&target, "%s/dev/console", dest) < 0) {
                r = log_oom();
                goto finish;
        }

        /* ptys can only exist on pts file systems, hence the right
         * tty has to be bind mounted to /dev/console. A bind mount
         * needs something to be mounted on, so a device node is
         * created first. It gets the major/minor of the tty, though
         * these do not actually matter as the node is mounted over
         * right away. */

        if (mknod(target, (console_st.st_mode & ~07777) | 0600, console_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, target, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
        }

finish:
        free(target);
        umask(saved_umask);

        return r;
}
506
/* Gives the container a /proc/kmsg that is backed by a FIFO.
 *
 * The FIFO is created as <dest>/dev/kmsg, bind mounted to <dest>/proc/kmsg,
 * opened, and the open file descriptor is queued into 'kmsg_socket' as an
 * SCM_RIGHTS message. Finally the /dev/kmsg name is unlinked again. The
 * umask is cleared for the duration of the call and restored afterwards.
 *
 * Returns a negative errno-style code on failure, otherwise the
 * (non-negative) result of chmod_and_chown(). */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo_path = NULL, *proc_path = NULL;
        int r, fifo_fd, sent;
        mode_t saved_umask;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* The kmsg FIFO is created as /dev/kmsg, but deleted again
         * right after it got bind mounted to /proc/kmsg. On the
         * reading side FIFOs behave very similar to /proc/kmsg, but
         * on the writing side they differ from /dev/kmsg: writing
         * blocks when nothing is reading. To keep containers from
         * deadlocking because of that, /dev/kmsg is simply made
         * unavailable to the container. */
        if (asprintf(&fifo_path, "%s/dev/kmsg", dest) < 0) {
                r = log_oom();
                goto finish;
        }

        if (asprintf(&proc_path, "%s/proc/kmsg", dest) < 0) {
                r = log_oom();
                goto finish;
        }

        if (mkfifo(fifo_path, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        r = chmod_and_chown(fifo_path, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto finish;
        }

        if (mount(fifo_path, proc_path, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        fifo_fd = open(fifo_path, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                r = -errno;
                goto finish;
        }

        /* Build a message that carries nothing but the fd as ancillary data */
        zero(control);
        zero(mh);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        memcpy(CMSG_DATA(cmsg), &fifo_fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Park the fd in the socket, so that it stays open for as
         * long as the child runs */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                r = -errno;
                goto finish;
        }

        /* Finally, take the FIFO away from /dev/kmsg again */
        unlink(fifo_path);

finish:
        free(fifo_path);
        free(proc_path);
        umask(saved_umask);

        return r;
}
600
601 static int setup_hostname(void) {
602         char *hn;
603         int r = 0;
604
605         hn = path_get_file_name(arg_directory);
606         if (hn) {
607                 hn = strdup(hn);
608                 if (!hn)
609                         return -ENOMEM;
610
611                 hostname_cleanup(hn);
612
613                 if (!isempty(hn))
614                         if (sethostname(hn, strlen(hn)) < 0)
615                                 r = -errno;
616
617                 free(hn);
618         }
619
620         return r;
621 }
622
623 static int setup_journal(const char *directory) {
624         sd_id128_t machine_id;
625         char *p = NULL, *b = NULL, *l, *q = NULL, *d = NULL;
626         int r;
627
628         if (arg_link_journal == LINK_NO)
629                 return 0;
630
631         p = strappend(directory, "/etc/machine-id");
632         if (!p) {
633                 r = log_oom();
634                 goto finish;
635         }
636
637         r = read_one_line_file(p, &b);
638         if (r == -ENOENT && arg_link_journal == LINK_AUTO) {
639                 r = 0;
640                 goto finish;
641         } else if (r < 0) {
642                 log_error("Failed to read machine ID: %s", strerror(-r));
643                 return r;
644         }
645
646         l = strstrip(b);
647         if (isempty(l) && arg_link_journal == LINK_AUTO) {
648                 r = 0;
649                 goto finish;
650         }
651
652         /* Verify validaty */
653         r = sd_id128_from_string(l, &machine_id);
654         if (r < 0) {
655                 log_error("Failed to parse machine ID: %s", strerror(-r));
656                 goto finish;
657         }
658
659         free(p);
660         p = strappend("/var/log/journal/", l);
661         q = strjoin(directory, "/var/log/journal/", l, NULL);
662         if (!p || !q) {
663                 r = log_oom();
664                 goto finish;
665         }
666
667         if (path_is_mount_point(p, false) > 0 ||
668             path_is_mount_point(q, false) > 0) {
669                 if (arg_link_journal != LINK_AUTO) {
670                         log_error("Journal already a mount point, refusing.");
671                         r = -EEXIST;
672                         goto finish;
673                 }
674
675                 r = 0;
676                 goto finish;
677         }
678
679         r = readlink_and_make_absolute(p, &d);
680         if (r >= 0) {
681                 if ((arg_link_journal == LINK_GUEST ||
682                      arg_link_journal == LINK_AUTO) &&
683                     path_equal(d, q)) {
684
685                         mkdir_p(q, 0755);
686
687                         r = 0;
688                         goto finish;
689                 }
690
691                 if (unlink(p) < 0) {
692                         log_error("Failed to remove symlink %s: %m", p);
693                         r = -errno;
694                         goto finish;
695                 }
696         } else if (r == -EINVAL) {
697
698                 if (arg_link_journal == LINK_GUEST &&
699                     rmdir(p) < 0) {
700
701                         if (errno == ENOTDIR)
702                                 log_error("%s already exists and is neither symlink nor directory.", p);
703                         else {
704                                 log_error("Failed to remove %s: %m", p);
705                                 r = -errno;
706                         }
707
708                         goto finish;
709                 }
710         } else if (r != -ENOENT) {
711                 log_error("readlink(%s) failed: %m", p);
712                 goto finish;
713         }
714
715         if (arg_link_journal == LINK_GUEST) {
716
717                 if (symlink(q, p) < 0) {
718                         log_error("Failed to symlink %s to %s: %m", q, p);
719                         r = -errno;
720                         goto finish;
721                 }
722
723                 mkdir_p(q, 0755);
724
725                 r = 0;
726                 goto finish;
727         }
728
729         if (arg_link_journal == LINK_HOST) {
730                 r = mkdir_p(p, 0755);
731                 if (r < 0) {
732                         log_error("Failed to create %s: %m", p);
733                         goto finish;
734                 }
735
736         } else if (access(p, F_OK) < 0) {
737                 r = 0;
738                 goto finish;
739         }
740
741         if (dir_is_empty(q) == 0) {
742                 log_error("%s not empty.", q);
743                 r = -ENOTEMPTY;
744                 goto finish;
745         }
746
747         r = mkdir_p(q, 0755);
748         if (r < 0) {
749                 log_error("Failed to create %s: %m", q);
750                 goto finish;
751         }
752
753         if (mount(p, q, "bind", MS_BIND, NULL) < 0) {
754                 log_error("Failed to bind mount journal from host into guest: %m");
755                 r = -errno;
756                 goto finish;
757         }
758
759         r = 0;
760
761 finish:
762         free(p);
763         free(q);
764         free(d);
765         free(b);
766         return r;
767
768 }
769
770 static int drop_capabilities(void) {
771         return capability_bounding_set_drop(~arg_retain, false);
772 }
773
/* Checks whether 'path' looks like the root of an OS tree. The existence of
 * <path>/bin/sh serves as the flag for that.
 *
 * Returns 1 if the file exists, 0 if it does not (or cannot be accessed),
 * and -ENOMEM if the path to test cannot be allocated. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
787
788 static int process_pty(int master, sigset_t *mask) {
789
790         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
791         size_t in_buffer_full = 0, out_buffer_full = 0;
792         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
793         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
794         int ep = -1, signal_fd = -1, r;
795
796         fd_nonblock(STDIN_FILENO, 1);
797         fd_nonblock(STDOUT_FILENO, 1);
798         fd_nonblock(master, 1);
799
800         signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
801         if (signal_fd < 0) {
802                 log_error("signalfd(): %m");
803                 r = -errno;
804                 goto finish;
805         }
806
807         ep = epoll_create1(EPOLL_CLOEXEC);
808         if (ep < 0) {
809                 log_error("Failed to create epoll: %m");
810                 r = -errno;
811                 goto finish;
812         }
813
814         zero(stdin_ev);
815         stdin_ev.events = EPOLLIN|EPOLLET;
816         stdin_ev.data.fd = STDIN_FILENO;
817
818         zero(stdout_ev);
819         stdout_ev.events = EPOLLOUT|EPOLLET;
820         stdout_ev.data.fd = STDOUT_FILENO;
821
822         zero(master_ev);
823         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
824         master_ev.data.fd = master;
825
826         zero(signal_ev);
827         signal_ev.events = EPOLLIN;
828         signal_ev.data.fd = signal_fd;
829
830         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
831             epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
832             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
833             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
834                 log_error("Failed to regiser fds in epoll: %m");
835                 r = -errno;
836                 goto finish;
837         }
838
839         for (;;) {
840                 struct epoll_event ev[16];
841                 ssize_t k;
842                 int i, nfds;
843
844                 nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
845                 if (nfds < 0) {
846
847                         if (errno == EINTR || errno == EAGAIN)
848                                 continue;
849
850                         log_error("epoll_wait(): %m");
851                         r = -errno;
852                         goto finish;
853                 }
854
855                 assert(nfds >= 1);
856
857                 for (i = 0; i < nfds; i++) {
858                         if (ev[i].data.fd == STDIN_FILENO) {
859
860                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
861                                         stdin_readable = true;
862
863                         } else if (ev[i].data.fd == STDOUT_FILENO) {
864
865                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
866                                         stdout_writable = true;
867
868                         } else if (ev[i].data.fd == master) {
869
870                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
871                                         master_readable = true;
872
873                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
874                                         master_writable = true;
875
876                         } else if (ev[i].data.fd == signal_fd) {
877                                 struct signalfd_siginfo sfsi;
878                                 ssize_t n;
879
880                                 n = read(signal_fd, &sfsi, sizeof(sfsi));
881                                 if (n != sizeof(sfsi)) {
882
883                                         if (n >= 0) {
884                                                 log_error("Failed to read from signalfd: invalid block size");
885                                                 r = -EIO;
886                                                 goto finish;
887                                         }
888
889                                         if (errno != EINTR && errno != EAGAIN) {
890                                                 log_error("Failed to read from signalfd: %m");
891                                                 r = -errno;
892                                                 goto finish;
893                                         }
894                                 } else {
895
896                                         if (sfsi.ssi_signo == SIGWINCH) {
897                                                 struct winsize ws;
898
899                                                 /* The window size changed, let's forward that. */
900                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
901                                                         ioctl(master, TIOCSWINSZ, &ws);
902                                         } else {
903                                                 r = 0;
904                                                 goto finish;
905                                         }
906                                 }
907                         }
908                 }
909
910                 while ((stdin_readable && in_buffer_full <= 0) ||
911                        (master_writable && in_buffer_full > 0) ||
912                        (master_readable && out_buffer_full <= 0) ||
913                        (stdout_writable && out_buffer_full > 0)) {
914
915                         if (stdin_readable && in_buffer_full < LINE_MAX) {
916
917                                 k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
918                                 if (k < 0) {
919
920                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
921                                                 stdin_readable = false;
922                                         else {
923                                                 log_error("read(): %m");
924                                                 r = -errno;
925                                                 goto finish;
926                                         }
927                                 } else
928                                         in_buffer_full += (size_t) k;
929                         }
930
931                         if (master_writable && in_buffer_full > 0) {
932
933                                 k = write(master, in_buffer, in_buffer_full);
934                                 if (k < 0) {
935
936                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
937                                                 master_writable = false;
938                                         else {
939                                                 log_error("write(): %m");
940                                                 r = -errno;
941                                                 goto finish;
942                                         }
943
944                                 } else {
945                                         assert(in_buffer_full >= (size_t) k);
946                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
947                                         in_buffer_full -= k;
948                                 }
949                         }
950
951                         if (master_readable && out_buffer_full < LINE_MAX) {
952
953                                 k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
954                                 if (k < 0) {
955
956                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
957                                                 master_readable = false;
958                                         else {
959                                                 log_error("read(): %m");
960                                                 r = -errno;
961                                                 goto finish;
962                                         }
963                                 }  else
964                                         out_buffer_full += (size_t) k;
965                         }
966
967                         if (stdout_writable && out_buffer_full > 0) {
968
969                                 k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
970                                 if (k < 0) {
971
972                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
973                                                 stdout_writable = false;
974                                         else {
975                                                 log_error("write(): %m");
976                                                 r = -errno;
977                                                 goto finish;
978                                         }
979
980                                 } else {
981                                         assert(out_buffer_full >= (size_t) k);
982                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
983                                         out_buffer_full -= k;
984                                 }
985                         }
986                 }
987         }
988
989 finish:
990         if (ep >= 0)
991                 close_nointr_nofail(ep);
992
993         if (signal_fd >= 0)
994                 close_nointr_nofail(signal_fd);
995
996         return r;
997 }
998
999 int main(int argc, char *argv[]) {
1000         pid_t pid = 0;
1001         int r = EXIT_FAILURE, k;
1002         char *oldcg = NULL, *newcg = NULL;
1003         char **controller = NULL;
1004         int master = -1;
1005         const char *console = NULL;
1006         struct termios saved_attr, raw_attr;
1007         sigset_t mask;
1008         bool saved_attr_valid = false;
1009         struct winsize ws;
1010         int kmsg_socket_pair[2] = { -1, -1 };
1011
1012         log_parse_environment();
1013         log_open();
1014
1015         r = parse_argv(argc, argv);
1016         if (r <= 0)
1017                 goto finish;
1018
1019         if (arg_directory) {
1020                 char *p;
1021
1022                 p = path_make_absolute_cwd(arg_directory);
1023                 free(arg_directory);
1024                 arg_directory = p;
1025         } else
1026                 arg_directory = get_current_dir_name();
1027
1028         if (!arg_directory) {
1029                 log_error("Failed to determine path");
1030                 goto finish;
1031         }
1032
1033         path_kill_slashes(arg_directory);
1034
1035         if (geteuid() != 0) {
1036                 log_error("Need to be root.");
1037                 goto finish;
1038         }
1039
1040         if (sd_booted() <= 0) {
1041                 log_error("Not running on a systemd system.");
1042                 goto finish;
1043         }
1044
1045         if (path_equal(arg_directory, "/")) {
1046                 log_error("Spawning container on root directory not supported.");
1047                 goto finish;
1048         }
1049
1050         if (is_os_tree(arg_directory) <= 0) {
1051                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
1052                 goto finish;
1053         }
1054
1055         k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg);
1056         if (k < 0) {
1057                 log_error("Failed to determine current cgroup: %s", strerror(-k));
1058                 goto finish;
1059         }
1060
1061         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
1062                 log_error("Failed to allocate cgroup path.");
1063                 goto finish;
1064         }
1065
1066         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
1067         if (k < 0)  {
1068                 log_error("Failed to create cgroup: %s", strerror(-k));
1069                 goto finish;
1070         }
1071
1072         STRV_FOREACH(controller, arg_controllers) {
1073                 k = cg_create_and_attach(*controller, newcg, 0);
1074                 if (k < 0)
1075                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
1076         }
1077
1078         master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
1079         if (master < 0) {
1080                 log_error("Failed to acquire pseudo tty: %m");
1081                 goto finish;
1082         }
1083
1084         console = ptsname(master);
1085         if (!console) {
1086                 log_error("Failed to determine tty name: %m");
1087                 goto finish;
1088         }
1089
1090         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
1091
1092         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
1093                 ioctl(master, TIOCSWINSZ, &ws);
1094
1095         if (unlockpt(master) < 0) {
1096                 log_error("Failed to unlock tty: %m");
1097                 goto finish;
1098         }
1099
1100         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
1101                 log_error("Failed to get terminal attributes: %m");
1102                 goto finish;
1103         }
1104
1105         saved_attr_valid = true;
1106
1107         raw_attr = saved_attr;
1108         cfmakeraw(&raw_attr);
1109         raw_attr.c_lflag &= ~ECHO;
1110
1111         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
1112                 log_error("Failed to set terminal attributes: %m");
1113                 goto finish;
1114         }
1115
1116         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
1117                 log_error("Failed to create kmsg socket pair");
1118                 goto finish;
1119         }
1120
1121         assert_se(sigemptyset(&mask) == 0);
1122         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
1123         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
1124
1125         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
1126         if (pid < 0) {
1127                 if (errno == EINVAL)
1128                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
1129                 else
1130                         log_error("clone() failed: %m");
1131
1132                 goto finish;
1133         }
1134
1135         if (pid == 0) {
1136                 /* child */
1137
1138                 const char *home = NULL;
1139                 uid_t uid = (uid_t) -1;
1140                 gid_t gid = (gid_t) -1;
1141                 const char *envp[] = {
1142                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1143                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
1144                         NULL, /* TERM */
1145                         NULL, /* HOME */
1146                         NULL, /* USER */
1147                         NULL, /* LOGNAME */
1148                         NULL, /* container_uuid */
1149                         NULL
1150                 };
1151
1152                 envp[2] = strv_find_prefix(environ, "TERM=");
1153
1154                 close_nointr_nofail(master);
1155
1156                 close_nointr(STDIN_FILENO);
1157                 close_nointr(STDOUT_FILENO);
1158                 close_nointr(STDERR_FILENO);
1159
1160                 close_all_fds(&kmsg_socket_pair[1], 1);
1161
1162                 reset_all_signal_handlers();
1163
1164                 assert_se(sigemptyset(&mask) == 0);
1165                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
1166
1167                 if (open_terminal(console, O_RDWR) != STDIN_FILENO ||
1168                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1169                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1170                         goto child_fail;
1171
1172                 if (setsid() < 0) {
1173                         log_error("setsid() failed: %m");
1174                         goto child_fail;
1175                 }
1176
1177                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) {
1178                         log_error("PR_SET_PDEATHSIG failed: %m");
1179                         goto child_fail;
1180                 }
1181
1182                 /* Mark everything as slave, so that we still
1183                  * receive mounts from the real root, but don't
1184                  * propagate mounts to the real root. */
1185                 if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0) {
1186                         log_error("MS_SLAVE|MS_REC failed: %m");
1187                         goto child_fail;
1188                 }
1189
1190                 /* Turn directory into bind mount */
1191                 if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REC, NULL) < 0) {
1192                         log_error("Failed to make bind mount.");
1193                         goto child_fail;
1194                 }
1195
1196                 if (arg_read_only)
1197                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY|MS_REC, NULL) < 0) {
1198                                 log_error("Failed to make read-only.");
1199                                 goto child_fail;
1200                         }
1201
1202                 if (mount_all(arg_directory) < 0)
1203                         goto child_fail;
1204
1205                 if (copy_devnodes(arg_directory) < 0)
1206                         goto child_fail;
1207
1208                 dev_setup(arg_directory);
1209
1210                 if (setup_dev_console(arg_directory, console) < 0)
1211                         goto child_fail;
1212
1213                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1214                         goto child_fail;
1215
1216                 close_nointr_nofail(kmsg_socket_pair[1]);
1217
1218                 if (setup_timezone(arg_directory) < 0)
1219                         goto child_fail;
1220
1221                 if (setup_resolv_conf(arg_directory) < 0)
1222                         goto child_fail;
1223
1224                 if (setup_journal(arg_directory) < 0)
1225                         goto child_fail;
1226
1227                 if (chdir(arg_directory) < 0) {
1228                         log_error("chdir(%s) failed: %m", arg_directory);
1229                         goto child_fail;
1230                 }
1231
1232                 if (mount(arg_directory, "/", NULL, MS_MOVE, NULL) < 0) {
1233                         log_error("mount(MS_MOVE) failed: %m");
1234                         goto child_fail;
1235                 }
1236
1237                 if (chroot(".") < 0) {
1238                         log_error("chroot() failed: %m");
1239                         goto child_fail;
1240                 }
1241
1242                 if (chdir("/") < 0) {
1243                         log_error("chdir() failed: %m");
1244                         goto child_fail;
1245                 }
1246
1247                 umask(0022);
1248
1249                 loopback_setup();
1250
1251                 if (drop_capabilities() < 0) {
1252                         log_error("drop_capabilities() failed: %m");
1253                         goto child_fail;
1254                 }
1255
1256                 if (arg_user) {
1257
1258                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1259                                 log_error("get_user_creds() failed: %m");
1260                                 goto child_fail;
1261                         }
1262
1263                         if (mkdir_parents_label(home, 0775) < 0) {
1264                                 log_error("mkdir_parents_label() failed: %m");
1265                                 goto child_fail;
1266                         }
1267
1268                         if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1269                                 log_error("mkdir_safe_label() failed: %m");
1270                                 goto child_fail;
1271                         }
1272
1273                         if (initgroups((const char*)arg_user, gid) < 0) {
1274                                 log_error("initgroups() failed: %m");
1275                                 goto child_fail;
1276                         }
1277
1278                         if (setresgid(gid, gid, gid) < 0) {
1279                                 log_error("setregid() failed: %m");
1280                                 goto child_fail;
1281                         }
1282
1283                         if (setresuid(uid, uid, uid) < 0) {
1284                                 log_error("setreuid() failed: %m");
1285                                 goto child_fail;
1286                         }
1287                 }
1288
1289                 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1290                     (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1291                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1292                     log_oom();
1293                     goto child_fail;
1294                 }
1295
1296                 if (arg_uuid) {
1297                         if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1298                                 log_oom();
1299                                 goto child_fail;
1300                         }
1301                 }
1302
1303                 setup_hostname();
1304
1305                 if (arg_boot) {
1306                         char **a;
1307                         size_t l;
1308
1309                         /* Automatically search for the init system */
1310
1311                         l = 1 + argc - optind;
1312                         a = newa(char*, l + 1);
1313                         memcpy(a + 1, argv + optind, l * sizeof(char*));
1314
1315                         a[0] = (char*) "/usr/lib/systemd/systemd";
1316                         execve(a[0], a, (char**) envp);
1317
1318                         a[0] = (char*) "/lib/systemd/systemd";
1319                         execve(a[0], a, (char**) envp);
1320
1321                         a[0] = (char*) "/sbin/init";
1322                         execve(a[0], a, (char**) envp);
1323                 } else if (argc > optind)
1324                         execvpe(argv[optind], argv + optind, (char**) envp);
1325                 else {
1326                         chdir(home ? home : "/root");
1327                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1328                 }
1329
1330                 log_error("execv() failed: %m");
1331
1332         child_fail:
1333                 _exit(EXIT_FAILURE);
1334         }
1335
1336         if (process_pty(master, &mask) < 0)
1337                 goto finish;
1338
1339         if (saved_attr_valid) {
1340                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1341                 saved_attr_valid = false;
1342         }
1343
1344         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1345
1346         if (r < 0)
1347                 r = EXIT_FAILURE;
1348
1349 finish:
1350         if (saved_attr_valid)
1351                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1352
1353         if (master >= 0)
1354                 close_nointr_nofail(master);
1355
1356         close_pipe(kmsg_socket_pair);
1357
1358         if (oldcg)
1359                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1360
1361         if (newcg)
1362                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1363
1364         free(arg_directory);
1365         strv_free(arg_controllers);
1366         free(oldcg);
1367         free(newcg);
1368
1369         return r;
1370 }