chiark / gitweb /
2879db11eb5542447648211757a922ecc73e3b7a
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55
/* Settings collected from the command line by parse_argv(). */

/* String-valued options. arg_directory and arg_user are heap copies;
 * arg_uuid is stored as the raw optarg pointer and is not owned. */
static char *arg_directory = NULL;
static char *arg_user = NULL;
static char *arg_uuid = NULL;

/* List produced by splitting the --controllers= argument at commas. */
static char **arg_controllers = NULL;

/* Plain on/off switches. */
static bool arg_boot = false;
static bool arg_read_only = false;
static bool arg_private_network = false;

/* Bit mask of capabilities, one bit per CAP_* number. The complement
 * of this mask is what drop_capabilities() hands to
 * capability_bounding_set_drop(); --capability= ORs further bits in. */
static uint64_t arg_retain =
        /* file ownership and access */
        (1ULL << CAP_CHOWN) | (1ULL << CAP_DAC_OVERRIDE) | (1ULL << CAP_DAC_READ_SEARCH) |
        (1ULL << CAP_FOWNER) | (1ULL << CAP_FSETID) | (1ULL << CAP_LEASE) |
        (1ULL << CAP_LINUX_IMMUTABLE) | (1ULL << CAP_SETFCAP) |
        /* process credentials and signalling */
        (1ULL << CAP_KILL) | (1ULL << CAP_SETGID) | (1ULL << CAP_SETUID) |
        (1ULL << CAP_SETPCAP) | (1ULL << CAP_IPC_OWNER) |
        /* networking */
        (1ULL << CAP_NET_BIND_SERVICE) | (1ULL << CAP_NET_BROADCAST) | (1ULL << CAP_NET_RAW) |
        /* system level */
        (1ULL << CAP_SYS_ADMIN) | (1ULL << CAP_SYS_CHROOT) | (1ULL << CAP_SYS_NICE) |
        (1ULL << CAP_SYS_PTRACE) | (1ULL << CAP_SYS_TTY_CONFIG) | (1ULL << CAP_SYS_RESOURCE);
86
/* Writes the usage synopsis followed by the option summary to stdout.
 * Always returns 0. */
static int help(void) {

        static const char option_summary[] =
                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
                "  -h --help             Show this help\n"
                "  -D --directory=NAME   Root directory for the container\n"
                "  -b --boot             Boot up full system (i.e. invoke init)\n"
                "  -u --user=USER        Run the command under specified user or uid\n"
                "  -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
                "     --uuid=UUID        Set a specific machine UUID for the container\n"
                "     --private-network  Disable network in container\n"
                "     --read-only        Mount the root directory read-only\n"
                "     --capability=CAP   In addition to the default, retain specified capability\n";

        /* Only the first line depends on how we were invoked */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n", program_invocation_short_name);
        fputs(option_summary, stdout);

        return 0;
}
104
105 static int parse_argv(int argc, char *argv[]) {
106
107         enum {
108                 ARG_PRIVATE_NETWORK = 0x100,
109                 ARG_UUID,
110                 ARG_READ_ONLY,
111                 ARG_CAPABILITY
112         };
113
114         static const struct option options[] = {
115                 { "help",            no_argument,       NULL, 'h'                 },
116                 { "directory",       required_argument, NULL, 'D'                 },
117                 { "user",            required_argument, NULL, 'u'                 },
118                 { "controllers",     required_argument, NULL, 'C'                 },
119                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
120                 { "boot",            no_argument,       NULL, 'b'                 },
121                 { "uuid",            required_argument, NULL, ARG_UUID            },
122                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
123                 { "capability",      required_argument, NULL, ARG_CAPABILITY      },
124                 { NULL,              0,                 NULL, 0                   }
125         };
126
127         int c;
128
129         assert(argc >= 0);
130         assert(argv);
131
132         while ((c = getopt_long(argc, argv, "+hD:u:C:b", options, NULL)) >= 0) {
133
134                 switch (c) {
135
136                 case 'h':
137                         help();
138                         return 0;
139
140                 case 'D':
141                         free(arg_directory);
142                         arg_directory = canonicalize_file_name(optarg);
143                         if (!arg_directory) {
144                                 log_error("Failed to canonicalize root directory.");
145                                 return -ENOMEM;
146                         }
147
148                         break;
149
150                 case 'u':
151                         free(arg_user);
152                         if (!(arg_user = strdup(optarg))) {
153                                 log_error("Failed to duplicate user name.");
154                                 return -ENOMEM;
155                         }
156
157                         break;
158
159                 case 'C':
160                         strv_free(arg_controllers);
161                         arg_controllers = strv_split(optarg, ",");
162                         if (!arg_controllers) {
163                                 log_error("Failed to split controllers list.");
164                                 return -ENOMEM;
165                         }
166                         strv_uniq(arg_controllers);
167
168                         break;
169
170                 case ARG_PRIVATE_NETWORK:
171                         arg_private_network = true;
172                         break;
173
174                 case 'b':
175                         arg_boot = true;
176                         break;
177
178                 case ARG_UUID:
179                         arg_uuid = optarg;
180                         break;
181
182                 case ARG_READ_ONLY:
183                         arg_read_only = true;
184                         break;
185
186                 case ARG_CAPABILITY: {
187                         char *state, *word;
188                         size_t length;
189
190                         FOREACH_WORD_SEPARATOR(word, length, optarg, ",", state) {
191                                 cap_value_t cap;
192                                 char *t;
193
194                                 t = strndup(word, length);
195                                 if (!t) {
196                                         log_error("Out of memory.");
197                                         return -ENOMEM;
198                                 }
199
200                                 if (cap_from_name(t, &cap) < 0) {
201                                         log_error("Failed to parse capability %s.", t);
202                                         free(t);
203                                         return -EINVAL;
204                                 }
205
206                                 free(t);
207                                 arg_retain |= 1ULL << (uint64_t) cap;
208                         }
209
210                         break;
211                 }
212
213                 case '?':
214                         return -EINVAL;
215
216                 default:
217                         log_error("Unknown option code %c", c);
218                         return -EINVAL;
219                 }
220         }
221
222         return 1;
223 }
224
/*
 * Performs every mount listed in the table below, each one relative to
 * the container root "dest".
 *
 * A failing entry does not stop the walk (only an allocation failure
 * does); the first error encountered is what gets returned, 0 if there
 * was none. Mount failures of entries not flagged "fatal" are ignored.
 */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV,      true },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND,                           true }, /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      true }, /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                           true }, /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      true }, /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,          true },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                           true },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false }, /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false }, /* Then, make it r/o */
#endif
        };

        const MountPoint *entry;
        int first_error = 0;

        for (entry = mount_table; entry < mount_table + ELEMENTSOF(mount_table); entry++) {
                char *target;
                int probe;

                if (asprintf(&target, "%s/%s", dest, entry->where) < 0) {
                        log_error("Out of memory");

                        if (first_error == 0)
                                first_error = -ENOMEM;

                        break;
                }

                probe = path_is_mount_point(target, false);
                if (probe < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-probe));
                        free(target);

                        if (first_error == 0)
                                first_error = probe;

                        continue;
                }

                /* Make sure there is something to mount on; the result
                 * is not checked, mount() below reports any problem. */
                mkdir_p_label(target, 0755);

                if (mount(entry->what, target, entry->type, entry->flags, entry->options) < 0 &&
                    entry->fatal) {

                        log_error("mount(%s) failed: %m", target);

                        if (first_error == 0)
                                first_error = -errno;
                }

                free(target);
        }

        return first_error;
}
298
/*
 * Bind mounts the host's /etc/localtime and /etc/timezone over the
 * files of the same name below "dest" and then remounts them
 * read-only. Mount failures are ignored on purpose; only running out
 * of memory is reported (-ENOMEM). Returns 0 otherwise.
 */
static int setup_timezone(const char *dest) {
        static const char * const host_files[] = {
                "/etc/localtime",
                "/etc/timezone",
        };
        size_t i;

        assert(dest);

        /* Fix the timezone, if possible */
        for (i = 0; i < sizeof(host_files) / sizeof(host_files[0]); i++) {
                char *target;

                if (asprintf(&target, "%s%s", dest, host_files[i]) < 0) {
                        log_error("Out of memory");
                        return -ENOMEM;
                }

                /* The read-only remount is only attempted if the bind
                 * mount itself went through */
                if (mount(host_files[i], target, "bind", MS_BIND, NULL) >= 0)
                        mount(host_files[i], target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
327
328 static int setup_resolv_conf(const char *dest) {
329         char *where;
330
331         assert(dest);
332
333         if (arg_private_network)
334                 return 0;
335
336         /* Fix resolv.conf, if possible */
337         if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
338                 log_error("Out of memory");
339                 return -ENOMEM;
340         }
341
342         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
343                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
344
345         free(where);
346
347         return 0;
348 }
349
350 static int copy_devnodes(const char *dest) {
351
352         static const char devnodes[] =
353                 "null\0"
354                 "zero\0"
355                 "full\0"
356                 "random\0"
357                 "urandom\0"
358                 "tty\0"
359                 "ptmx\0"
360                 "rtc0\0";
361
362         const char *d;
363         int r = 0;
364         mode_t u;
365
366         assert(dest);
367
368         u = umask(0000);
369
370         NULSTR_FOREACH(d, devnodes) {
371                 struct stat st;
372                 char *from = NULL, *to = NULL;
373
374                 asprintf(&from, "/dev/%s", d);
375                 asprintf(&to, "%s/dev/%s", dest, d);
376
377                 if (!from || !to) {
378                         log_error("Failed to allocate devnode path");
379
380                         free(from);
381                         free(to);
382
383                         from = to = NULL;
384
385                         if (r == 0)
386                                 r = -ENOMEM;
387
388                         break;
389                 }
390
391                 if (stat(from, &st) < 0) {
392
393                         if (errno != ENOENT) {
394                                 log_error("Failed to stat %s: %m", from);
395                                 if (r == 0)
396                                         r = -errno;
397                         }
398
399                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
400
401                         log_error("%s is not a char or block device, cannot copy.", from);
402                         if (r == 0)
403                                 r = -EIO;
404
405                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
406
407                         log_error("mknod(%s) failed: %m", dest);
408                         if (r == 0)
409                                 r = -errno;
410                 }
411
412                 free(from);
413                 free(to);
414         }
415
416         umask(u);
417
418         return r;
419 }
420
/*
 * Makes the tty named by "console" show up as "<dest>/dev/console".
 *
 * The tty's access mode and ownership are set to 0600/0:0, a character
 * device node is created at the target path and the tty is then bind
 * mounted on top of it. Returns a negative errno-style value on
 * failure; on success the (non-negative) result of chmod_and_chown()
 * is passed through.
 */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat tty_st;
        char *node = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &tty_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto out;
        }

        if (!S_ISCHR(tty_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto out;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto out;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto out;
        }

        /* We need to bind mount the right tty to /dev/console since
         * ptys can only exist on pts file systems. The device node
         * created here merely serves as the mount point; its
         * major/minor does not matter because the bind mount covers
         * it right away. */

        if (mknod(node, (tty_st.st_mode & ~07777) | 0600, tty_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto out;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
                goto out;
        }

out:
        free(node);
        umask(saved_umask);

        return r;
}
480
/*
 * Provides a /proc/kmsg replacement for the container rooted at "dest".
 *
 * A FIFO is created as "<dest>/dev/kmsg", bind mounted onto
 * "<dest>/proc/kmsg", opened, and the resulting fd is passed over
 * "kmsg_socket" as an SCM_RIGHTS message. Finally the FIFO is unlinked
 * from /dev again. Returns a negative errno-style value on failure; on
 * success the (non-negative) result of chmod_and_chown() is passed
 * through.
 */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo = NULL, *target = NULL;
        int r, fd, sent;
        mode_t saved_umask;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* The FIFO is created as /dev/kmsg but removed from there once
         * it has been bind mounted to /proc/kmsg. Reading from a FIFO
         * is close enough to reading /proc/kmsg, but writing to one
         * blocks while nobody reads, unlike the real /dev/kmsg. To
         * keep containers from deadlocking on that, /dev/kmsg is
         * simply not made available to them. */
        if (asprintf(&fifo, "%s/dev/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto out;
        }

        if (asprintf(&target, "%s/proc/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto out;
        }

        if (mkfifo(fifo, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -errno;
                goto out;
        }

        r = chmod_and_chown(fifo, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto out;
        }

        if (mount(fifo, target, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -errno;
                goto out;
        }

        fd = open(fifo, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fd < 0) {
                log_error("Failed to open fifo: %m");
                r = -errno;
                goto out;
        }

        /* Build a message that carries nothing but the fd as
         * ancillary data */
        zero(mh);
        zero(control);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Park the fd in the socket, so that it stays open for as
         * long as the child runs; our own copy is not needed past
         * this point */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                r = -errno;
                goto out;
        }

        /* And now make the FIFO unavailable as /dev/kmsg... */
        unlink(fifo);

out:
        free(fifo);
        free(target);
        umask(saved_umask);

        return r;
}
576
577 static int setup_hostname(void) {
578         char *hn;
579         int r = 0;
580
581         hn = path_get_file_name(arg_directory);
582         if (hn) {
583                 hn = strdup(hn);
584                 if (!hn)
585                         return -ENOMEM;
586
587                 hostname_cleanup(hn);
588
589                 if (!isempty(hn))
590                         if (sethostname(hn, strlen(hn)) < 0)
591                                 r = -errno;
592
593                 free(hn);
594         }
595
596         return r;
597 }
598
599 static int drop_capabilities(void) {
600         return capability_bounding_set_drop(~arg_retain, false);
601 }
602
/*
 * Tells whether "path" looks like the root of an OS installation. The
 * presence of <path>/bin/sh is used as the flag for that.
 *
 * Returns 1 if the file exists, 0 if access() fails for whatever
 * reason, and -ENOMEM if the path to probe could not be allocated.
 */
static int is_os_tree(const char *path) {
        char *shell;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found ? 1 : 0;
}
616
/*
 * Forwards data between our own stdin/stdout and the pty "master"
 * until a signal from "mask" is received.
 *
 * stdin, stdout and the master are switched to non-blocking mode and
 * watched edge-triggered with epoll; the signals in "mask" are
 * received through a signalfd. SIGWINCH causes the window size of
 * stdin to be copied to the master, any other signal ends the loop.
 *
 * Returns 0 when stopped by a signal, or a negative errno-style value
 * on failure.
 */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
        if (signal_fd < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        ep = epoll_create1(EPOLL_CLOEXEC);
        if (ep < 0) {
                log_error("Failed to create epoll: %m");
                r = -errno;
                goto finish;
        }

        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                log_error("Failed to register fds in epoll: %m");
                r = -errno;
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
                if (nfds < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        log_error("epoll_wait(): %m");
                        r = -errno;
                        goto finish;
                }

                assert(nfds >= 1);

                /* First pass: only record what became ready. Since the
                 * data fds are edge-triggered the flags stay set until
                 * an actual read()/write() below reports otherwise. */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                n = read(signal_fd, &sfsi, sizeof(sfsi));
                                if (n != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                log_error("Failed to read from signalfd: %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                /* Any other signal in the mask ends forwarding */
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* Second pass: move data for as long as any direction
                 * can make progress. */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdin_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF. Without clearing the flag the
                                         * loop condition stays true with an
                                         * empty buffer and we would spin. */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                k = write(master, in_buffer, in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the rest
                                         * at the start of the buffer */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF, same reasoning as for stdin */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdout_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
819
820 int main(int argc, char *argv[]) {
821         pid_t pid = 0;
822         int r = EXIT_FAILURE, k;
823         char *oldcg = NULL, *newcg = NULL;
824         char **controller = NULL;
825         int master = -1;
826         const char *console = NULL;
827         struct termios saved_attr, raw_attr;
828         sigset_t mask;
829         bool saved_attr_valid = false;
830         struct winsize ws;
831         int kmsg_socket_pair[2] = { -1, -1 };
832
833         log_parse_environment();
834         log_open();
835
836         if ((r = parse_argv(argc, argv)) <= 0)
837                 goto finish;
838
839         if (arg_directory) {
840                 char *p;
841
842                 p = path_make_absolute_cwd(arg_directory);
843                 free(arg_directory);
844                 arg_directory = p;
845         } else
846                 arg_directory = get_current_dir_name();
847
848         if (!arg_directory) {
849                 log_error("Failed to determine path");
850                 goto finish;
851         }
852
853         path_kill_slashes(arg_directory);
854
855         if (geteuid() != 0) {
856                 log_error("Need to be root.");
857                 goto finish;
858         }
859
860         if (sd_booted() <= 0) {
861                 log_error("Not running on a systemd system.");
862                 goto finish;
863         }
864
865         if (path_equal(arg_directory, "/")) {
866                 log_error("Spawning container on root directory not supported.");
867                 goto finish;
868         }
869
870         if (is_os_tree(arg_directory) <= 0) {
871                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
872                 goto finish;
873         }
874
875         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
876                 log_error("Failed to determine current cgroup: %s", strerror(-k));
877                 goto finish;
878         }
879
880         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
881                 log_error("Failed to allocate cgroup path.");
882                 goto finish;
883         }
884
885         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
886         if (k < 0)  {
887                 log_error("Failed to create cgroup: %s", strerror(-k));
888                 goto finish;
889         }
890
891         STRV_FOREACH(controller,arg_controllers) {
892                 k = cg_create_and_attach(*controller, newcg, 0);
893                 if (k < 0)
894                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
895         }
896
897         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
898                 log_error("Failed to acquire pseudo tty: %m");
899                 goto finish;
900         }
901
902         if (!(console = ptsname(master))) {
903                 log_error("Failed to determine tty name: %m");
904                 goto finish;
905         }
906
907         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
908
909         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
910                 ioctl(master, TIOCSWINSZ, &ws);
911
912         if (unlockpt(master) < 0) {
913                 log_error("Failed to unlock tty: %m");
914                 goto finish;
915         }
916
917         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
918                 log_error("Failed to get terminal attributes: %m");
919                 goto finish;
920         }
921
922         saved_attr_valid = true;
923
924         raw_attr = saved_attr;
925         cfmakeraw(&raw_attr);
926         raw_attr.c_lflag &= ~ECHO;
927
928         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
929                 log_error("Failed to set terminal attributes: %m");
930                 goto finish;
931         }
932
933         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
934                 log_error("Failed to create kmsg socket pair");
935                 goto finish;
936         }
937
938         assert_se(sigemptyset(&mask) == 0);
939         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
940         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
941
942         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
943         if (pid < 0) {
944                 if (errno == EINVAL)
945                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
946                 else
947                         log_error("clone() failed: %m");
948
949                 goto finish;
950         }
951
952         if (pid == 0) {
953                 /* child */
954
955                 const char *home = NULL;
956                 uid_t uid = (uid_t) -1;
957                 gid_t gid = (gid_t) -1;
958                 const char *envp[] = {
959                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
960                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
961                         NULL, /* TERM */
962                         NULL, /* HOME */
963                         NULL, /* USER */
964                         NULL, /* LOGNAME */
965                         NULL, /* container_uuid */
966                         NULL
967                 };
968
969                 envp[2] = strv_find_prefix(environ, "TERM=");
970
971                 close_nointr_nofail(master);
972
973                 close_nointr(STDIN_FILENO);
974                 close_nointr(STDOUT_FILENO);
975                 close_nointr(STDERR_FILENO);
976
977                 close_all_fds(&kmsg_socket_pair[1], 1);
978
979                 reset_all_signal_handlers();
980
981                 assert_se(sigemptyset(&mask) == 0);
982                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
983
984                 if (setsid() < 0)
985                         goto child_fail;
986
987                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
988                         goto child_fail;
989
990                 /* Mark / as private, in case somebody marked it shared */
991                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
992                         goto child_fail;
993
994                 /* Turn directory into bind mount */
995                 if (mount(arg_directory, arg_directory, "bind", MS_BIND, NULL) < 0) {
996                         log_error("Failed to make bind mount.");
997                         goto child_fail;
998                 }
999
1000                 if (arg_read_only)
1001                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
1002                                 log_error("Failed to make read-only.");
1003                                 goto child_fail;
1004                         }
1005
1006                 if (mount_all(arg_directory) < 0)
1007                         goto child_fail;
1008
1009                 if (copy_devnodes(arg_directory) < 0)
1010                         goto child_fail;
1011
1012                 if (setup_dev_console(arg_directory, console) < 0)
1013                         goto child_fail;
1014
1015                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1016                         goto child_fail;
1017
1018                 close_nointr_nofail(kmsg_socket_pair[1]);
1019
1020                 if (setup_timezone(arg_directory) < 0)
1021                         goto child_fail;
1022
1023                 if (setup_resolv_conf(arg_directory) < 0)
1024                         goto child_fail;
1025
1026                 if (chdir(arg_directory) < 0) {
1027                         log_error("chdir(%s) failed: %m", arg_directory);
1028                         goto child_fail;
1029                 }
1030
1031                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
1032                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1033                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1034                         goto child_fail;
1035
1036                 if (mount(arg_directory, "/", "bind", MS_MOVE, NULL) < 0) {
1037                         log_error("mount(MS_BIND) failed: %m");
1038                         goto child_fail;
1039                 }
1040
1041                 if (chroot(".") < 0) {
1042                         log_error("chroot() failed: %m");
1043                         goto child_fail;
1044                 }
1045
1046                 if (chdir("/") < 0) {
1047                         log_error("chdir() failed: %m");
1048                         goto child_fail;
1049                 }
1050
1051                 umask(0022);
1052
1053                 loopback_setup();
1054
1055                 if (drop_capabilities() < 0) {
1056                         log_error("drop_capabilities() failed: %m");
1057                         goto child_fail;
1058                 }
1059
1060                 if (arg_user) {
1061
1062                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home, NULL) < 0) {
1063                                 log_error("get_user_creds() failed: %m");
1064                                 goto child_fail;
1065                         }
1066
1067                         if (mkdir_parents_label(home, 0775) < 0) {
1068                                 log_error("mkdir_parents_label() failed: %m");
1069                                 goto child_fail;
1070                         }
1071
1072                         if (mkdir_safe_label(home, 0775, uid, gid) < 0) {
1073                                 log_error("mkdir_safe_label() failed: %m");
1074                                 goto child_fail;
1075                         }
1076
1077                         if (initgroups((const char*)arg_user, gid) < 0) {
1078                                 log_error("initgroups() failed: %m");
1079                                 goto child_fail;
1080                         }
1081
1082                         if (setresgid(gid, gid, gid) < 0) {
1083                                 log_error("setregid() failed: %m");
1084                                 goto child_fail;
1085                         }
1086
1087                         if (setresuid(uid, uid, uid) < 0) {
1088                                 log_error("setreuid() failed: %m");
1089                                 goto child_fail;
1090                         }
1091                 }
1092
1093                 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1094                     (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1095                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1096                     log_error("Out of memory");
1097                     goto child_fail;
1098                 }
1099
1100                 if (arg_uuid) {
1101                         if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1102                                 log_error("Out of memory");
1103                                 goto child_fail;
1104                         }
1105                 }
1106
1107                 setup_hostname();
1108
1109                 if (arg_boot) {
1110                         char **a;
1111                         size_t l;
1112
1113                         /* Automatically search for the init system */
1114
1115                         l = 1 + argc - optind;
1116                         a = newa(char*, l + 1);
1117                         memcpy(a + 1, argv + optind, l * sizeof(char*));
1118
1119                         a[0] = (char*) "/usr/lib/systemd/systemd";
1120                         execve(a[0], a, (char**) envp);
1121
1122                         a[0] = (char*) "/lib/systemd/systemd";
1123                         execve(a[0], a, (char**) envp);
1124
1125                         a[0] = (char*) "/sbin/init";
1126                         execve(a[0], a, (char**) envp);
1127                 } else if (argc > optind)
1128                         execvpe(argv[optind], argv + optind, (char**) envp);
1129                 else {
1130                         chdir(home ? home : "/root");
1131                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1132                 }
1133
1134                 log_error("execv() failed: %m");
1135
1136         child_fail:
1137                 _exit(EXIT_FAILURE);
1138         }
1139
1140         if (process_pty(master, &mask) < 0)
1141                 goto finish;
1142
1143         if (saved_attr_valid) {
1144                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1145                 saved_attr_valid = false;
1146         }
1147
1148         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1149
1150         if (r < 0)
1151                 r = EXIT_FAILURE;
1152
1153 finish:
1154         if (saved_attr_valid)
1155                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1156
1157         if (master >= 0)
1158                 close_nointr_nofail(master);
1159
1160         close_pipe(kmsg_socket_pair);
1161
1162         if (oldcg)
1163                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1164
1165         if (newcg)
1166                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1167
1168         free(arg_directory);
1169         strv_free(arg_controllers);
1170         free(oldcg);
1171         free(newcg);
1172
1173         return r;
1174 }