chiark / gitweb /
main: add configuration option to alter capability bounding set for PID 1
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55
56 static char *arg_directory = NULL;          /* Container root directory; set by -D/--directory, otherwise filled in by main() */
57 static char *arg_user = NULL;               /* Copy of the -u/--user argument */
58 static char **arg_controllers = NULL;       /* De-duplicated list split from the comma-separated -C/--controllers argument */
59 static char *arg_uuid = NULL;               /* --uuid argument; points into argv, not a separate allocation */
60 static bool arg_private_network = false;    /* Set by --private-network */
61 static bool arg_read_only = false;          /* Set by --read-only */
62 static bool arg_boot = false;               /* Set by -b/--boot */
63
/* Writes the usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Synopsis line first, since it is the only part that needs formatting ... */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n", program_invocation_short_name);

        /* ... followed by the fixed description and option list. */
        fputs("Spawn a minimal namespace container for debugging, testing and building.\n\n"
              "  -h --help             Show this help\n"
              "  -D --directory=NAME   Root directory for the container\n"
              "  -b --boot             Boot up full system (i.e. invoke init)\n"
              "  -u --user=USER        Run the command under specified user or uid\n"
              "  -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
              "     --uuid=UUID        Set a specific machine UUID for the container\n"
              "     --private-network  Disable network in container\n"
              "     --read-only        Mount the root directory read-only\n",
              stdout);

        return 0;
}
80
81 static int parse_argv(int argc, char *argv[]) {
82
83         enum {
84                 ARG_PRIVATE_NETWORK = 0x100,
85                 ARG_UUID,
86                 ARG_READ_ONLY
87         };
88
89         static const struct option options[] = {
90                 { "help",            no_argument,       NULL, 'h'                 },
91                 { "directory",       required_argument, NULL, 'D'                 },
92                 { "user",            required_argument, NULL, 'u'                 },
93                 { "controllers",     required_argument, NULL, 'C'                 },
94                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
95                 { "boot",            no_argument,       NULL, 'b'                 },
96                 { "uuid",            required_argument, NULL, ARG_UUID            },
97                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
98                 { NULL,              0,                 NULL, 0                   }
99         };
100
101         int c;
102
103         assert(argc >= 0);
104         assert(argv);
105
106         while ((c = getopt_long(argc, argv, "+hD:u:C:b", options, NULL)) >= 0) {
107
108                 switch (c) {
109
110                 case 'h':
111                         help();
112                         return 0;
113
114                 case 'D':
115                         free(arg_directory);
116                         arg_directory = canonicalize_file_name(optarg);
117                         if (!arg_directory) {
118                                 log_error("Failed to canonicalize root directory.");
119                                 return -ENOMEM;
120                         }
121
122                         break;
123
124                 case 'u':
125                         free(arg_user);
126                         if (!(arg_user = strdup(optarg))) {
127                                 log_error("Failed to duplicate user name.");
128                                 return -ENOMEM;
129                         }
130
131                         break;
132
133                 case 'C':
134                         strv_free(arg_controllers);
135                         arg_controllers = strv_split(optarg, ",");
136                         if (!arg_controllers) {
137                                 log_error("Failed to split controllers list.");
138                                 return -ENOMEM;
139                         }
140                         strv_uniq(arg_controllers);
141
142                         break;
143
144                 case ARG_PRIVATE_NETWORK:
145                         arg_private_network = true;
146                         break;
147
148                 case 'b':
149                         arg_boot = true;
150                         break;
151
152                 case ARG_UUID:
153                         arg_uuid = optarg;
154                         break;
155
156                 case ARG_READ_ONLY:
157                         arg_read_only = true;
158                         break;
159
160                 case '?':
161                         return -EINVAL;
162
163                 default:
164                         log_error("Unknown option code %c", c);
165                         return -EINVAL;
166                 }
167         }
168
169         return 1;
170 }
171
/* Establishes the mounts listed in the table below, each relative to the
 * container root "dest".
 *
 * All entries are attempted even if an earlier one fails; only running out of
 * memory aborts the loop. Mount failures are logged and recorded only for
 * entries flagged as fatal. Returns 0 on success, otherwise the first error
 * encountered as a negative errno-style value. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",            "/proc",           "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV,      true  },
                { "/proc/sys",       "/proc/sys",       "bind",  NULL,       MS_BIND,                           true  },  /* Bind mount first */
                { "/proc/sys",       "/proc/sys",       "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      true  },  /* Then, make it r/o */
                { "/sys",            "/sys",            "bind",  NULL,       MS_BIND,                           true  },  /* Bind mount first */
                { "/sys",            "/sys",            "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      true  },  /* Then, make it r/o */
                { "tmpfs",           "/dev",            "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,          true  },
                { "/dev/pts",        "/dev/pts",        "bind",  NULL,       MS_BIND,                           true  },
                { "tmpfs",           "/run",            "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind",  NULL,       MS_BIND,                           false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      false },  /* Then, make it r/o */
#endif
        };

        const MountPoint *entry;
        const MountPoint *const table_end = mount_table + ELEMENTSOF(mount_table);
        int first_error = 0;

        for (entry = mount_table; entry < table_end; entry++) {
                char *target;
                int q;

                if (asprintf(&target, "%s/%s", dest, entry->where) < 0) {
                        log_error("Out of memory");

                        if (first_error == 0)
                                first_error = -ENOMEM;

                        break;
                }

                /* Only a failing check matters here; a successful result
                 * does not influence whether we mount. */
                q = path_is_mount_point(target, false);
                if (q < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-q));
                        free(target);

                        if (first_error == 0)
                                first_error = q;

                        continue;
                }

                mkdir_p(target, 0755);

                if (mount(entry->what, target, entry->type, entry->flags, entry->options) < 0 &&
                    entry->fatal) {

                        log_error("mount(%s) failed: %m", target);

                        if (first_error == 0)
                                first_error = -errno;
                }

                free(target);
        }

        return first_error;
}
245
/* Makes the host's time zone configuration visible inside the container by
 * bind mounting /etc/localtime and /etc/timezone read-only onto the
 * corresponding paths below "dest".
 *
 * This is best effort: mount failures are ignored. Returns 0, or -ENOMEM if a
 * target path could not be allocated. */
static int setup_timezone(const char *dest) {

        static const char *const host_files[] = {
                "/etc/localtime",
                "/etc/timezone",
                NULL
        };

        const char *const *f;

        assert(dest);

        /* Fix the timezone, if possible */
        for (f = host_files; *f; f++) {
                char *target;

                /* Each host path is absolute, so appending it to dest gives
                 * the matching path inside the container tree. */
                if (asprintf(&target, "%s%s", dest, *f) < 0) {
                        log_error("Out of memory");
                        return -ENOMEM;
                }

                /* Bind mount first, then flip the new mount to read-only. */
                if (mount(*f, target, "bind", MS_BIND, NULL) >= 0)
                        mount(*f, target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
274
275 static int setup_resolv_conf(const char *dest) {
276         char *where;
277
278         assert(dest);
279
280         if (arg_private_network)
281                 return 0;
282
283         /* Fix resolv.conf, if possible */
284         if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
285                 log_error("Out of memory");
286                 return -ENOMEM;
287         }
288
289         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
290                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
291
292         free(where);
293
294         return 0;
295 }
296
297 static int copy_devnodes(const char *dest) {
298
299         static const char devnodes[] =
300                 "null\0"
301                 "zero\0"
302                 "full\0"
303                 "random\0"
304                 "urandom\0"
305                 "tty\0"
306                 "ptmx\0"
307                 "rtc0\0";
308
309         const char *d;
310         int r = 0;
311         mode_t u;
312
313         assert(dest);
314
315         u = umask(0000);
316
317         NULSTR_FOREACH(d, devnodes) {
318                 struct stat st;
319                 char *from = NULL, *to = NULL;
320
321                 asprintf(&from, "/dev/%s", d);
322                 asprintf(&to, "%s/dev/%s", dest, d);
323
324                 if (!from || !to) {
325                         log_error("Failed to allocate devnode path");
326
327                         free(from);
328                         free(to);
329
330                         from = to = NULL;
331
332                         if (r == 0)
333                                 r = -ENOMEM;
334
335                         break;
336                 }
337
338                 if (stat(from, &st) < 0) {
339
340                         if (errno != ENOENT) {
341                                 log_error("Failed to stat %s: %m", from);
342                                 if (r == 0)
343                                         r = -errno;
344                         }
345
346                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
347
348                         log_error("%s is not a char or block device, cannot copy.", from);
349                         if (r == 0)
350                                 r = -EIO;
351
352                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
353
354                         log_error("mknod(%s) failed: %m", dest);
355                         if (r == 0)
356                                 r = -errno;
357                 }
358
359                 free(from);
360                 free(to);
361         }
362
363         umask(u);
364
365         return r;
366 }
367
/* Makes the TTY "console" available as /dev/console below "dest".
 *
 * The TTY's access mode and ownership are reset first, then a placeholder
 * device node is created and the TTY is bind mounted over it. Returns a
 * negative errno-style value on failure; on success the result of
 * chmod_and_chown() is passed through. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat tty_st;
        char *node = NULL;
        mode_t saved_umask;
        int ret;

        assert(dest);
        assert(console);

        /* No umask while creating the node; restored at "finish". */
        saved_umask = umask(0000);

        if (stat(console, &tty_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                ret = -errno;
                goto finish;
        }

        if (!S_ISCHR(tty_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                ret = -EIO;
                goto finish;
        }

        ret = chmod_and_chown(console, 0600, 0, 0);
        if (ret < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-ret));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                ret = -ENOMEM;
                goto finish;
        }

        /* ptys can only live on pts file systems, hence the right tty has
         * to be bind mounted to /dev/console. A bind mount needs a target,
         * so create a device node for that purpose first. Its major/minor
         * does not actually matter, as it gets mounted over anyway. */
        if (mknod(node, (tty_st.st_mode & ~07777) | 0600, tty_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                ret = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                ret = -errno;
                goto finish;
        }

finish:
        free(node);
        umask(saved_umask);

        return ret;
}
427
/* Provides a stand-in for /proc/kmsg inside the container.
 *
 * A FIFO is created at "dest"/dev/kmsg, bind mounted onto "dest"/proc/kmsg,
 * opened, and the open fd is queued on "kmsg_socket" so that it stays
 * referenced. Finally the FIFO's /dev/kmsg name is removed again. Returns a
 * negative errno-style value on failure; on success the result of
 * chmod_and_chown() is passed through. */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo_path = NULL, *proc_path = NULL;
        int ret, fifo_fd, sent;
        mode_t saved_umask;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* The kmsg FIFO is created as /dev/kmsg, but deleted again right
         * after it has been bind mounted to /proc/kmsg. The reading side of
         * a FIFO behaves much like /proc/kmsg, but the writing side differs
         * from /dev/kmsg: writes block while nobody is reading. To keep
         * containers from deadlocking on that, /dev/kmsg is simply made
         * unavailable to the container. */
        if (asprintf(&fifo_path, "%s/dev/kmsg", dest) < 0) {
                log_error("Out of memory");
                ret = -ENOMEM;
                goto finish;
        }

        if (asprintf(&proc_path, "%s/proc/kmsg", dest) < 0) {
                log_error("Out of memory");
                ret = -ENOMEM;
                goto finish;
        }

        if (mkfifo(fifo_path, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                ret = -errno;
                goto finish;
        }

        ret = chmod_and_chown(fifo_path, 0600, 0, 0);
        if (ret < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-ret));
                goto finish;
        }

        if (mount(fifo_path, proc_path, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                ret = -errno;
                goto finish;
        }

        fifo_fd = open(fifo_path, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                ret = -errno;
                goto finish;
        }

        /* Build a message that carries nothing but the fd as SCM_RIGHTS
         * ancillary data. */
        zero(mh);
        zero(control);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fifo_fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Park the fd in the socket, so that it stays open for as long as
         * the child runs. Our own copy is not needed afterwards. */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                ret = -errno;
                goto finish;
        }

        /* And now make the FIFO unavailable as /dev/kmsg... */
        unlink(fifo_path);

finish:
        free(fifo_path);
        free(proc_path);
        umask(saved_umask);

        return ret;
}
523
524 static int setup_hostname(void) {
525         char *hn;
526         int r = 0;
527
528         hn = path_get_file_name(arg_directory);
529         if (hn) {
530                 hn = strdup(hn);
531                 if (!hn)
532                         return -ENOMEM;
533
534                 hostname_cleanup(hn);
535
536                 if (!isempty(hn))
537                         if (sethostname(hn, strlen(hn)) < 0)
538                                 r = -errno;
539
540                 free(hn);
541         }
542
543         return r;
544 }
545
/* Removes every capability that is not listed below from the capability
 * bounding set. Returns whatever capability_bounding_set_drop() returns. */
static int drop_capabilities(void) {

        /* The capabilities that are retained. */
        static const unsigned retained_caps[] = {
                CAP_CHOWN,
                CAP_DAC_OVERRIDE,
                CAP_DAC_READ_SEARCH,
                CAP_FOWNER,
                CAP_FSETID,
                CAP_IPC_OWNER,
                CAP_KILL,
                CAP_LEASE,
                CAP_LINUX_IMMUTABLE,
                CAP_NET_BIND_SERVICE,
                CAP_NET_BROADCAST,
                CAP_NET_RAW,
                CAP_SETGID,
                CAP_SETFCAP,
                CAP_SETPCAP,
                CAP_SETUID,
                CAP_SYS_ADMIN,
                CAP_SYS_CHROOT,
                CAP_SYS_NICE,
                CAP_SYS_PTRACE,
                CAP_SYS_TTY_CONFIG
        };

        uint64_t retain = 0;
        size_t i;

        /* Fold the list into a bit mask, one bit per capability number. */
        for (i = 0; i < sizeof(retained_caps) / sizeof(retained_caps[0]); i++)
                retain |= 1ULL << retained_caps[i];

        /* Drop the complement of what we want to keep. */
        return capability_bounding_set_drop(~retain, false);
}
573
/* Checks whether "path" looks like the root directory of an OS installation.
 *
 * Returns 1 if it does, 0 if it does not, and -ENOMEM if the path to probe
 * could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell;
        int found;

        /* The presence of /bin/sh serves as the indicator for an OS tree. */
        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found ? 1 : 0;
}
587
/* Forwards data between our own stdin/stdout and the pty "master" until a
 * signal arrives or an error occurs.
 *
 * stdin is copied to the master and the master to stdout, each through a
 * LINE_MAX sized buffer. The signals in "mask" are received through a
 * signalfd: SIGWINCH causes the window size of stdin to be copied to the
 * master, any other signal ends the loop.
 *
 * Returns 0 when terminated by a signal, or a negative errno-style value on
 * failure. */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
                log_error("Failed to create epoll: %m");
                r = -errno;
                goto finish;
        }

        /* The data fds are watched edge-triggered: an event merely sets the
         * matching readable/writable flag, which stays set until the fd
         * reports that it cannot make progress anymore. */
        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                log_error("Failed to regiser fds in epoll: %m");
                r = -errno;
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        log_error("epoll_wait(): %m");
                        r = -errno;
                        goto finish;
                }

                assert(nfds >= 1);

                /* First pass: translate the events into state flags and
                 * handle signals. */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                log_error("Failed to read from signalfd: %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                /* Any other signal in the mask ends the forwarding. */
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* Second pass: move data for as long as any direction can
                 * still make progress. */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdin_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* End of file. Without clearing the
                                         * flag the loop condition would stay
                                         * true with an empty buffer and we
                                         * would spin here forever. */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                if ((k = write(master, in_buffer, in_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written and move the rest to the front. */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* Same as for stdin: a zero-length
                                         * read must not keep the loop busy. */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdout_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written and move the rest to the front. */
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
790
791 int main(int argc, char *argv[]) {
792         pid_t pid = 0;
793         int r = EXIT_FAILURE, k;
794         char *oldcg = NULL, *newcg = NULL;
795         char **controller = NULL;
796         int master = -1;
797         const char *console = NULL;
798         struct termios saved_attr, raw_attr;
799         sigset_t mask;
800         bool saved_attr_valid = false;
801         struct winsize ws;
802         int kmsg_socket_pair[2] = { -1, -1 };
803
804         log_parse_environment();
805         log_open();
806
807         if ((r = parse_argv(argc, argv)) <= 0)
808                 goto finish;
809
810         if (arg_directory) {
811                 char *p;
812
813                 p = path_make_absolute_cwd(arg_directory);
814                 free(arg_directory);
815                 arg_directory = p;
816         } else
817                 arg_directory = get_current_dir_name();
818
819         if (!arg_directory) {
820                 log_error("Failed to determine path");
821                 goto finish;
822         }
823
824         path_kill_slashes(arg_directory);
825
826         if (geteuid() != 0) {
827                 log_error("Need to be root.");
828                 goto finish;
829         }
830
831         if (sd_booted() <= 0) {
832                 log_error("Not running on a systemd system.");
833                 goto finish;
834         }
835
836         if (path_equal(arg_directory, "/")) {
837                 log_error("Spawning container on root directory not supported.");
838                 goto finish;
839         }
840
841         if (is_os_tree(arg_directory) <= 0) {
842                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
843                 goto finish;
844         }
845
846         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
847                 log_error("Failed to determine current cgroup: %s", strerror(-k));
848                 goto finish;
849         }
850
851         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
852                 log_error("Failed to allocate cgroup path.");
853                 goto finish;
854         }
855
856         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
857         if (k < 0)  {
858                 log_error("Failed to create cgroup: %s", strerror(-k));
859                 goto finish;
860         }
861
862         STRV_FOREACH(controller,arg_controllers) {
863                 k = cg_create_and_attach(*controller, newcg, 0);
864                 if (k < 0)
865                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
866         }
867
868         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
869                 log_error("Failed to acquire pseudo tty: %m");
870                 goto finish;
871         }
872
873         if (!(console = ptsname(master))) {
874                 log_error("Failed to determine tty name: %m");
875                 goto finish;
876         }
877
878         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
879
880         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
881                 ioctl(master, TIOCSWINSZ, &ws);
882
883         if (unlockpt(master) < 0) {
884                 log_error("Failed to unlock tty: %m");
885                 goto finish;
886         }
887
888         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
889                 log_error("Failed to get terminal attributes: %m");
890                 goto finish;
891         }
892
893         saved_attr_valid = true;
894
895         raw_attr = saved_attr;
896         cfmakeraw(&raw_attr);
897         raw_attr.c_lflag &= ~ECHO;
898
899         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
900                 log_error("Failed to set terminal attributes: %m");
901                 goto finish;
902         }
903
904         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
905                 log_error("Failed to create kmsg socket pair");
906                 goto finish;
907         }
908
909         assert_se(sigemptyset(&mask) == 0);
910         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
911         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
912
913         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
914         if (pid < 0) {
915                 if (errno == EINVAL)
916                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
917                 else
918                         log_error("clone() failed: %m");
919
920                 goto finish;
921         }
922
923         if (pid == 0) {
924                 /* child */
925
926                 const char *home = NULL;
927                 uid_t uid = (uid_t) -1;
928                 gid_t gid = (gid_t) -1;
929                 const char *envp[] = {
930                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
931                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
932                         NULL, /* TERM */
933                         NULL, /* HOME */
934                         NULL, /* USER */
935                         NULL, /* LOGNAME */
936                         NULL, /* container_uuid */
937                         NULL
938                 };
939
940                 envp[2] = strv_find_prefix(environ, "TERM=");
941
942                 close_nointr_nofail(master);
943
944                 close_nointr(STDIN_FILENO);
945                 close_nointr(STDOUT_FILENO);
946                 close_nointr(STDERR_FILENO);
947
948                 close_all_fds(&kmsg_socket_pair[1], 1);
949
950                 reset_all_signal_handlers();
951
952                 assert_se(sigemptyset(&mask) == 0);
953                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
954
955                 if (setsid() < 0)
956                         goto child_fail;
957
958                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
959                         goto child_fail;
960
961                 /* Mark / as private, in case somebody marked it shared */
962                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
963                         goto child_fail;
964
965                 /* Turn directory into bind mount */
966                 if (mount(arg_directory, arg_directory, "bind", MS_BIND, NULL) < 0) {
967                         log_error("Failed to make bind mount.");
968                         goto child_fail;
969                 }
970
971                 if (arg_read_only)
972                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
973                                 log_error("Failed to make read-only.");
974                                 goto child_fail;
975                         }
976
977                 if (mount_all(arg_directory) < 0)
978                         goto child_fail;
979
980                 if (copy_devnodes(arg_directory) < 0)
981                         goto child_fail;
982
983                 if (setup_dev_console(arg_directory, console) < 0)
984                         goto child_fail;
985
986                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
987                         goto child_fail;
988
989                 close_nointr_nofail(kmsg_socket_pair[1]);
990
991                 if (setup_timezone(arg_directory) < 0)
992                         goto child_fail;
993
994                 if (setup_resolv_conf(arg_directory) < 0)
995                         goto child_fail;
996
997                 if (chdir(arg_directory) < 0) {
998                         log_error("chdir(%s) failed: %m", arg_directory);
999                         goto child_fail;
1000                 }
1001
1002                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
1003                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1004                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1005                         goto child_fail;
1006
1007                 if (mount(arg_directory, "/", "bind", MS_MOVE, NULL) < 0) {
1008                         log_error("mount(MS_BIND) failed: %m");
1009                         goto child_fail;
1010                 }
1011
1012                 if (chroot(".") < 0) {
1013                         log_error("chroot() failed: %m");
1014                         goto child_fail;
1015                 }
1016
1017                 if (chdir("/") < 0) {
1018                         log_error("chdir() failed: %m");
1019                         goto child_fail;
1020                 }
1021
1022                 umask(0022);
1023
1024                 loopback_setup();
1025
1026                 if (drop_capabilities() < 0) {
1027                         log_error("drop_capabilities() failed: %m");
1028                         goto child_fail;
1029                 }
1030
1031                 if (arg_user) {
1032
1033                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
1034                                 log_error("get_user_creds() failed: %m");
1035                                 goto child_fail;
1036                         }
1037
1038                         if (mkdir_parents(home, 0775) < 0) {
1039                                 log_error("mkdir_parents() failed: %m");
1040                                 goto child_fail;
1041                         }
1042
1043                         if (safe_mkdir(home, 0775, uid, gid) < 0) {
1044                                 log_error("safe_mkdir() failed: %m");
1045                                 goto child_fail;
1046                         }
1047
1048                         if (initgroups((const char*)arg_user, gid) < 0) {
1049                                 log_error("initgroups() failed: %m");
1050                                 goto child_fail;
1051                         }
1052
1053                         if (setresgid(gid, gid, gid) < 0) {
1054                                 log_error("setregid() failed: %m");
1055                                 goto child_fail;
1056                         }
1057
1058                         if (setresuid(uid, uid, uid) < 0) {
1059                                 log_error("setreuid() failed: %m");
1060                                 goto child_fail;
1061                         }
1062                 }
1063
1064                 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1065                     (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1066                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1067                     log_error("Out of memory");
1068                     goto child_fail;
1069                 }
1070
1071                 if (arg_uuid) {
1072                         if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1073                                 log_error("Out of memory");
1074                                 goto child_fail;
1075                         }
1076                 }
1077
1078                 setup_hostname();
1079
1080                 if (arg_boot) {
1081                         char **a;
1082                         size_t l;
1083
1084                         /* Automatically search for the init system */
1085
1086                         l = 1 + argc - optind;
1087                         a = newa(char*, l + 1);
1088                         memcpy(a + 1, argv + optind, l * sizeof(char*));
1089
1090                         a[0] = (char*) "/usr/lib/systemd/systemd";
1091                         execve(a[0], a, (char**) envp);
1092
1093                         a[0] = (char*) "/lib/systemd/systemd";
1094                         execve(a[0], a, (char**) envp);
1095
1096                         a[0] = (char*) "/sbin/init";
1097                         execve(a[0], a, (char**) envp);
1098                 } else if (argc > optind)
1099                         execvpe(argv[optind], argv + optind, (char**) envp);
1100                 else {
1101                         chdir(home ? home : "/root");
1102                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1103                 }
1104
1105                 log_error("execv() failed: %m");
1106
1107         child_fail:
1108                 _exit(EXIT_FAILURE);
1109         }
1110
1111         if (process_pty(master, &mask) < 0)
1112                 goto finish;
1113
1114         if (saved_attr_valid) {
1115                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1116                 saved_attr_valid = false;
1117         }
1118
1119         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1120
1121         if (r < 0)
1122                 r = EXIT_FAILURE;
1123
1124 finish:
1125         if (saved_attr_valid)
1126                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1127
1128         if (master >= 0)
1129                 close_nointr_nofail(master);
1130
1131         close_pipe(kmsg_socket_pair);
1132
1133         if (oldcg)
1134                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1135
1136         if (newcg)
1137                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1138
1139         free(arg_directory);
1140         strv_free(arg_controllers);
1141         free(oldcg);
1142         free(newcg);
1143
1144         return r;
1145 }