chiark / gitweb /
31e8b015df2d1895b7974f7d1d53873dc2735773
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "path-util.h"
54 #include "loopback-setup.h"
55
/* Command line settings, filled in by parse_argv(). */
static char *arg_directory = NULL;       /* -D/--directory: heap-allocated, canonicalized container root */
static char *arg_user = NULL;            /* -u/--user: heap-allocated copy of the user name or uid string */
static char **arg_controllers = NULL;    /* -C/--controllers: de-duplicated string vector of cgroup controllers */
static char *arg_uuid = NULL;            /* --uuid: points directly at optarg (i.e. into argv), not a copy */
static bool arg_private_network = false; /* --private-network */
static bool arg_read_only = false;       /* --read-only */
static bool arg_boot = false;            /* -b/--boot */
63
64 static int help(void) {
65
66         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
67                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
68                "  -h --help             Show this help\n"
69                "  -D --directory=NAME   Root directory for the container\n"
70                "  -b --boot             Boot up full system (i.e. invoke init)\n"
71                "  -u --user=USER        Run the command under specified user or uid\n"
72                "  -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
73                "     --uuid=UUID        Set a specific machine UUID for the container\n"
74                "     --private-network  Disable network in container\n"
75                "     --read-only        Mount the root directory read-only\n",
76                program_invocation_short_name);
77
78         return 0;
79 }
80
81 static int parse_argv(int argc, char *argv[]) {
82
83         enum {
84                 ARG_PRIVATE_NETWORK = 0x100,
85                 ARG_UUID,
86                 ARG_READ_ONLY
87         };
88
89         static const struct option options[] = {
90                 { "help",            no_argument,       NULL, 'h'                 },
91                 { "directory",       required_argument, NULL, 'D'                 },
92                 { "user",            required_argument, NULL, 'u'                 },
93                 { "controllers",     required_argument, NULL, 'C'                 },
94                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
95                 { "boot",            no_argument,       NULL, 'b'                 },
96                 { "uuid",            required_argument, NULL, ARG_UUID            },
97                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
98                 { NULL,              0,                 NULL, 0                   }
99         };
100
101         int c;
102
103         assert(argc >= 0);
104         assert(argv);
105
106         while ((c = getopt_long(argc, argv, "+hD:u:C:b", options, NULL)) >= 0) {
107
108                 switch (c) {
109
110                 case 'h':
111                         help();
112                         return 0;
113
114                 case 'D':
115                         free(arg_directory);
116                         arg_directory = canonicalize_file_name(optarg);
117                         if (!arg_directory) {
118                                 log_error("Failed to canonicalize root directory.");
119                                 return -ENOMEM;
120                         }
121
122                         break;
123
124                 case 'u':
125                         free(arg_user);
126                         if (!(arg_user = strdup(optarg))) {
127                                 log_error("Failed to duplicate user name.");
128                                 return -ENOMEM;
129                         }
130
131                         break;
132
133                 case 'C':
134                         strv_free(arg_controllers);
135                         arg_controllers = strv_split(optarg, ",");
136                         if (!arg_controllers) {
137                                 log_error("Failed to split controllers list.");
138                                 return -ENOMEM;
139                         }
140                         strv_uniq(arg_controllers);
141
142                         break;
143
144                 case ARG_PRIVATE_NETWORK:
145                         arg_private_network = true;
146                         break;
147
148                 case 'b':
149                         arg_boot = true;
150                         break;
151
152                 case ARG_UUID:
153                         arg_uuid = optarg;
154                         break;
155
156                 case ARG_READ_ONLY:
157                         arg_read_only = true;
158                         break;
159
160                 case '?':
161                         return -EINVAL;
162
163                 default:
164                         log_error("Unknown option code %c", c);
165                         return -EINVAL;
166                 }
167         }
168
169         return 1;
170 }
171
/* Establishes the API file systems (/proc, /sys, /dev, /run, ...) listed
 * in the table below underneath the container root "dest".
 *
 * All table entries are attempted even if an earlier one fails, except
 * on memory exhaustion which aborts the loop. Returns 0 on success,
 * otherwise the negative errno-style code of the first failure seen.
 * Mount failures of entries not marked fatal are ignored silently. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND, true                       },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        int result = 0;
        unsigned idx;

        for (idx = 0; idx < ELEMENTSOF(mount_table); idx++) {
                const MountPoint *entry = mount_table + idx;
                char *target;
                int probe;

                if (asprintf(&target, "%s/%s", dest, entry->where) < 0) {
                        log_error("Out of memory");

                        if (result == 0)
                                result = -ENOMEM;

                        /* No point in going on without memory */
                        break;
                }

                probe = path_is_mount_point(target, false);
                if (probe >= 0) {

                        /* Best effort: make sure the mount point exists */
                        mkdir_p(target, 0755);

                        if (mount(entry->what,
                                  target,
                                  entry->type,
                                  entry->flags,
                                  entry->options) < 0 &&
                            entry->fatal) {

                                log_error("mount(%s) failed: %m", target);

                                if (result == 0)
                                        result = -errno;
                        }

                } else {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-probe));

                        if (result == 0)
                                result = probe;
                }

                free(target);
        }

        return result;
}
245
/* Makes the host's timezone configuration visible inside the container
 * by bind mounting /etc/localtime and /etc/timezone read-only over the
 * respective files below "dest".
 *
 * The mounts themselves are best effort, their failure is ignored.
 * Returns 0, or -ENOMEM if a path could not be allocated. */
static int setup_timezone(const char *dest) {
        char *target;

        assert(dest);

        /* Fix the timezone, if possible */
        if (asprintf(&target, "%s/etc/localtime", dest) < 0)
                goto oom;

        /* Only bother with the r/o remount if the bind mount worked */
        if (!(mount("/etc/localtime", target, "bind", MS_BIND, NULL) < 0))
                mount("/etc/localtime", target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

        free(target);

        if (asprintf(&target, "%s/etc/timezone", dest) < 0)
                goto oom;

        if (!(mount("/etc/timezone", target, "bind", MS_BIND, NULL) < 0))
                mount("/etc/timezone", target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

        free(target);

        return 0;

oom:
        log_error("Out of memory");
        return -ENOMEM;
}
274
275 static int setup_resolv_conf(const char *dest) {
276         char *where;
277
278         assert(dest);
279
280         if (arg_private_network)
281                 return 0;
282
283         /* Fix resolv.conf, if possible */
284         if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
285                 log_error("Out of memory");
286                 return -ENOMEM;
287         }
288
289         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
290                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
291
292         free(where);
293
294         return 0;
295 }
296
297 static int copy_devnodes(const char *dest) {
298
299         static const char devnodes[] =
300                 "null\0"
301                 "zero\0"
302                 "full\0"
303                 "random\0"
304                 "urandom\0"
305                 "tty\0"
306                 "ptmx\0"
307                 "rtc0\0";
308
309         const char *d;
310         int r = 0;
311         mode_t u;
312
313         assert(dest);
314
315         u = umask(0000);
316
317         NULSTR_FOREACH(d, devnodes) {
318                 struct stat st;
319                 char *from = NULL, *to = NULL;
320
321                 asprintf(&from, "/dev/%s", d);
322                 asprintf(&to, "%s/dev/%s", dest, d);
323
324                 if (!from || !to) {
325                         log_error("Failed to allocate devnode path");
326
327                         free(from);
328                         free(to);
329
330                         from = to = NULL;
331
332                         if (r == 0)
333                                 r = -ENOMEM;
334
335                         break;
336                 }
337
338                 if (stat(from, &st) < 0) {
339
340                         if (errno != ENOENT) {
341                                 log_error("Failed to stat %s: %m", from);
342                                 if (r == 0)
343                                         r = -errno;
344                         }
345
346                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
347
348                         log_error("%s is not a char or block device, cannot copy.", from);
349                         if (r == 0)
350                                 r = -EIO;
351
352                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
353
354                         log_error("mknod(%s) failed: %m", dest);
355                         if (r == 0)
356                                 r = -errno;
357                 }
358
359                 free(from);
360                 free(to);
361         }
362
363         umask(u);
364
365         return r;
366 }
367
/* Makes the TTY "console" available as /dev/console below the container
 * root "dest".
 *
 * The TTY is first reset to mode 0600, owned by uid/gid 0. Then a
 * character device node is created at the destination, and the TTY is
 * bind mounted over it. The umask is cleared for the duration of the
 * call and restored afterwards.
 *
 * Returns the (non-negative) result of chmod_and_chown() on success, a
 * negative errno-style value on failure. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat console_st;
        char *node = NULL;
        mode_t saved_umask;
        int ret;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &console_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                ret = -errno;
                goto out;
        }

        if (!S_ISCHR(console_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                ret = -EIO;
                goto out;
        }

        ret = chmod_and_chown(console, 0600, 0, 0);
        if (ret < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-ret));
                goto out;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                ret = -ENOMEM;
                goto out;
        }

        /* PTYs can only live on a pts file system, hence the right
         * TTY has to be bind mounted to /dev/console. A bind mount
         * needs something to be mounted on, which is why a device
         * node is created first. Its major/minor are of no real
         * importance, as the node is mounted over right away. */

        if (mknod(node, (console_st.st_mode & ~07777) | 0600, console_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                ret = -errno;
                goto out;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                ret = -errno;
        }

out:
        free(node);
        umask(saved_umask);

        return ret;
}
427
428 static int setup_kmsg(const char *dest, int kmsg_socket) {
429         char *from = NULL, *to = NULL;
430         int r, fd, k;
431         mode_t u;
432         union {
433                 struct cmsghdr cmsghdr;
434                 uint8_t buf[CMSG_SPACE(sizeof(int))];
435         } control;
436         struct msghdr mh;
437         struct cmsghdr *cmsg;
438
439         assert(dest);
440         assert(kmsg_socket >= 0);
441
442         u = umask(0000);
443
444         /* We create the kmsg FIFO as /dev/kmsg, but immediately
445          * delete it after bind mounting it to /proc/kmsg. While FIFOs
446          * on the reading side behave very similar to /proc/kmsg,
447          * their writing side behaves differently from /dev/kmsg in
448          * that writing blocks when nothing is reading. In order to
449          * avoid any problems with containers deadlocking due to this
450          * we simply make /dev/kmsg unavailable to the container. */
451         if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
452                 log_error("Out of memory");
453                 r = -ENOMEM;
454                 goto finish;
455         }
456
457         if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
458                 log_error("Out of memory");
459                 r = -ENOMEM;
460                 goto finish;
461         }
462
463         if (mkfifo(from, 0600) < 0) {
464                 log_error("mkfifo() for /dev/kmsg failed: %m");
465                 r = -errno;
466                 goto finish;
467         }
468
469         r = chmod_and_chown(from, 0600, 0, 0);
470         if (r < 0) {
471                 log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
472                 goto finish;
473         }
474
475         if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
476                 log_error("Bind mount for /proc/kmsg failed: %m");
477                 r = -errno;
478                 goto finish;
479         }
480
481         fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
482         if (fd < 0) {
483                 log_error("Failed to open fifo: %m");
484                 r = -errno;
485                 goto finish;
486         }
487
488         zero(mh);
489         zero(control);
490
491         mh.msg_control = &control;
492         mh.msg_controllen = sizeof(control);
493
494         cmsg = CMSG_FIRSTHDR(&mh);
495         cmsg->cmsg_level = SOL_SOCKET;
496         cmsg->cmsg_type = SCM_RIGHTS;
497         cmsg->cmsg_len = CMSG_LEN(sizeof(int));
498         memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
499
500         mh.msg_controllen = cmsg->cmsg_len;
501
502         /* Store away the fd in the socket, so that it stays open as
503          * long as we run the child */
504         k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
505         close_nointr_nofail(fd);
506
507         if (k < 0) {
508                 log_error("Failed to send FIFO fd: %m");
509                 r = -errno;
510                 goto finish;
511         }
512
513         /* And now make the FIFO unavailable as /dev/kmsg... */
514         unlink(from);
515
516 finish:
517         free(from);
518         free(to);
519         umask(u);
520
521         return r;
522 }
523
524 static int setup_hostname(void) {
525         char *hn;
526         int r = 0;
527
528         hn = path_get_file_name(arg_directory);
529         if (hn) {
530                 hn = strdup(hn);
531                 if (!hn)
532                         return -ENOMEM;
533
534                 hostname_cleanup(hn);
535
536                 if (!isempty(hn))
537                         if (sethostname(hn, strlen(hn)) < 0)
538                                 r = -errno;
539
540                 free(hn);
541         }
542
543         return r;
544 }
545
/* Removes every capability that is not on the list below from the
 * capability bounding set of the calling process.
 *
 * Returns 0 on success, or the negative errno of the first failing
 * prctl(PR_CAPBSET_DROP) call, in which case the remaining capabilities
 * are left untouched. */
static int drop_capabilities(void) {
        static const unsigned long retain[] = {
                CAP_CHOWN,
                CAP_DAC_OVERRIDE,
                CAP_DAC_READ_SEARCH,
                CAP_FOWNER,
                CAP_FSETID,
                CAP_IPC_OWNER,
                CAP_KILL,
                CAP_LEASE,
                CAP_LINUX_IMMUTABLE,
                CAP_NET_BIND_SERVICE,
                CAP_NET_BROADCAST,
                CAP_NET_RAW,
                CAP_SETGID,
                CAP_SETFCAP,
                CAP_SETPCAP,
                CAP_SETUID,
                CAP_SYS_ADMIN,
                CAP_SYS_CHROOT,
                CAP_SYS_NICE,
                CAP_SYS_PTRACE,
                CAP_SYS_TTY_CONFIG
        };

        unsigned long cap;

        for (cap = 0; cap <= cap_last_cap(); cap++) {
                bool keep = false;
                unsigned j;

                /* Is this one on the list of capabilities to retain? */
                for (j = 0; j < ELEMENTSOF(retain) && !keep; j++)
                        keep = retain[j] == cap;

                if (keep)
                        continue;

                if (prctl(PR_CAPBSET_DROP, cap) < 0) {
                        log_error("PR_CAPBSET_DROP failed: %m");
                        return -errno;
                }
        }

        return 0;
}
591
/* Checks whether the directory "path" looks like the root of an OS
 * installation. The existence of bin/sh below it is used as indicator.
 *
 * Returns 1 if it does, 0 if it does not, -ENOMEM on allocation
 * failure. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found ? 1 : 0;
}
605
606 static int process_pty(int master, sigset_t *mask) {
607
608         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
609         size_t in_buffer_full = 0, out_buffer_full = 0;
610         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
611         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
612         int ep = -1, signal_fd = -1, r;
613
614         fd_nonblock(STDIN_FILENO, 1);
615         fd_nonblock(STDOUT_FILENO, 1);
616         fd_nonblock(master, 1);
617
618         if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
619                 log_error("signalfd(): %m");
620                 r = -errno;
621                 goto finish;
622         }
623
624         if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
625                 log_error("Failed to create epoll: %m");
626                 r = -errno;
627                 goto finish;
628         }
629
630         zero(stdin_ev);
631         stdin_ev.events = EPOLLIN|EPOLLET;
632         stdin_ev.data.fd = STDIN_FILENO;
633
634         zero(stdout_ev);
635         stdout_ev.events = EPOLLOUT|EPOLLET;
636         stdout_ev.data.fd = STDOUT_FILENO;
637
638         zero(master_ev);
639         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
640         master_ev.data.fd = master;
641
642         zero(signal_ev);
643         signal_ev.events = EPOLLIN;
644         signal_ev.data.fd = signal_fd;
645
646         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
647             epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
648             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
649             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
650                 log_error("Failed to regiser fds in epoll: %m");
651                 r = -errno;
652                 goto finish;
653         }
654
655         for (;;) {
656                 struct epoll_event ev[16];
657                 ssize_t k;
658                 int i, nfds;
659
660                 if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {
661
662                         if (errno == EINTR || errno == EAGAIN)
663                                 continue;
664
665                         log_error("epoll_wait(): %m");
666                         r = -errno;
667                         goto finish;
668                 }
669
670                 assert(nfds >= 1);
671
672                 for (i = 0; i < nfds; i++) {
673                         if (ev[i].data.fd == STDIN_FILENO) {
674
675                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
676                                         stdin_readable = true;
677
678                         } else if (ev[i].data.fd == STDOUT_FILENO) {
679
680                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
681                                         stdout_writable = true;
682
683                         } else if (ev[i].data.fd == master) {
684
685                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
686                                         master_readable = true;
687
688                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
689                                         master_writable = true;
690
691                         } else if (ev[i].data.fd == signal_fd) {
692                                 struct signalfd_siginfo sfsi;
693                                 ssize_t n;
694
695                                 if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {
696
697                                         if (n >= 0) {
698                                                 log_error("Failed to read from signalfd: invalid block size");
699                                                 r = -EIO;
700                                                 goto finish;
701                                         }
702
703                                         if (errno != EINTR && errno != EAGAIN) {
704                                                 log_error("Failed to read from signalfd: %m");
705                                                 r = -errno;
706                                                 goto finish;
707                                         }
708                                 } else {
709
710                                         if (sfsi.ssi_signo == SIGWINCH) {
711                                                 struct winsize ws;
712
713                                                 /* The window size changed, let's forward that. */
714                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
715                                                         ioctl(master, TIOCSWINSZ, &ws);
716                                         } else {
717                                                 r = 0;
718                                                 goto finish;
719                                         }
720                                 }
721                         }
722                 }
723
724                 while ((stdin_readable && in_buffer_full <= 0) ||
725                        (master_writable && in_buffer_full > 0) ||
726                        (master_readable && out_buffer_full <= 0) ||
727                        (stdout_writable && out_buffer_full > 0)) {
728
729                         if (stdin_readable && in_buffer_full < LINE_MAX) {
730
731                                 if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {
732
733                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
734                                                 stdin_readable = false;
735                                         else {
736                                                 log_error("read(): %m");
737                                                 r = -errno;
738                                                 goto finish;
739                                         }
740                                 } else
741                                         in_buffer_full += (size_t) k;
742                         }
743
744                         if (master_writable && in_buffer_full > 0) {
745
746                                 if ((k = write(master, in_buffer, in_buffer_full)) < 0) {
747
748                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
749                                                 master_writable = false;
750                                         else {
751                                                 log_error("write(): %m");
752                                                 r = -errno;
753                                                 goto finish;
754                                         }
755
756                                 } else {
757                                         assert(in_buffer_full >= (size_t) k);
758                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
759                                         in_buffer_full -= k;
760                                 }
761                         }
762
763                         if (master_readable && out_buffer_full < LINE_MAX) {
764
765                                 if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {
766
767                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
768                                                 master_readable = false;
769                                         else {
770                                                 log_error("read(): %m");
771                                                 r = -errno;
772                                                 goto finish;
773                                         }
774                                 }  else
775                                         out_buffer_full += (size_t) k;
776                         }
777
778                         if (stdout_writable && out_buffer_full > 0) {
779
780                                 if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {
781
782                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
783                                                 stdout_writable = false;
784                                         else {
785                                                 log_error("write(): %m");
786                                                 r = -errno;
787                                                 goto finish;
788                                         }
789
790                                 } else {
791                                         assert(out_buffer_full >= (size_t) k);
792                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
793                                         out_buffer_full -= k;
794                                 }
795                         }
796                 }
797         }
798
799 finish:
800         if (ep >= 0)
801                 close_nointr_nofail(ep);
802
803         if (signal_fd >= 0)
804                 close_nointr_nofail(signal_fd);
805
806         return r;
807 }
808
809 int main(int argc, char *argv[]) {
810         pid_t pid = 0;
811         int r = EXIT_FAILURE, k;
812         char *oldcg = NULL, *newcg = NULL;
813         char **controller = NULL;
814         int master = -1;
815         const char *console = NULL;
816         struct termios saved_attr, raw_attr;
817         sigset_t mask;
818         bool saved_attr_valid = false;
819         struct winsize ws;
820         int kmsg_socket_pair[2] = { -1, -1 };
821
822         log_parse_environment();
823         log_open();
824
825         if ((r = parse_argv(argc, argv)) <= 0)
826                 goto finish;
827
828         if (arg_directory) {
829                 char *p;
830
831                 p = path_make_absolute_cwd(arg_directory);
832                 free(arg_directory);
833                 arg_directory = p;
834         } else
835                 arg_directory = get_current_dir_name();
836
837         if (!arg_directory) {
838                 log_error("Failed to determine path");
839                 goto finish;
840         }
841
842         path_kill_slashes(arg_directory);
843
844         if (geteuid() != 0) {
845                 log_error("Need to be root.");
846                 goto finish;
847         }
848
849         if (sd_booted() <= 0) {
850                 log_error("Not running on a systemd system.");
851                 goto finish;
852         }
853
854         if (path_equal(arg_directory, "/")) {
855                 log_error("Spawning container on root directory not supported.");
856                 goto finish;
857         }
858
859         if (is_os_tree(arg_directory) <= 0) {
860                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
861                 goto finish;
862         }
863
864         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
865                 log_error("Failed to determine current cgroup: %s", strerror(-k));
866                 goto finish;
867         }
868
869         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
870                 log_error("Failed to allocate cgroup path.");
871                 goto finish;
872         }
873
874         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
875         if (k < 0)  {
876                 log_error("Failed to create cgroup: %s", strerror(-k));
877                 goto finish;
878         }
879
880         STRV_FOREACH(controller,arg_controllers) {
881                 k = cg_create_and_attach(*controller, newcg, 0);
882                 if (k < 0)
883                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
884         }
885
886         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
887                 log_error("Failed to acquire pseudo tty: %m");
888                 goto finish;
889         }
890
891         if (!(console = ptsname(master))) {
892                 log_error("Failed to determine tty name: %m");
893                 goto finish;
894         }
895
896         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
897
898         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
899                 ioctl(master, TIOCSWINSZ, &ws);
900
901         if (unlockpt(master) < 0) {
902                 log_error("Failed to unlock tty: %m");
903                 goto finish;
904         }
905
906         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
907                 log_error("Failed to get terminal attributes: %m");
908                 goto finish;
909         }
910
911         saved_attr_valid = true;
912
913         raw_attr = saved_attr;
914         cfmakeraw(&raw_attr);
915         raw_attr.c_lflag &= ~ECHO;
916
917         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
918                 log_error("Failed to set terminal attributes: %m");
919                 goto finish;
920         }
921
922         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
923                 log_error("Failed to create kmsg socket pair");
924                 goto finish;
925         }
926
927         assert_se(sigemptyset(&mask) == 0);
928         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
929         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
930
931         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
932         if (pid < 0) {
933                 if (errno == EINVAL)
934                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
935                 else
936                         log_error("clone() failed: %m");
937
938                 goto finish;
939         }
940
941         if (pid == 0) {
942                 /* child */
943
944                 const char *home = NULL;
945                 uid_t uid = (uid_t) -1;
946                 gid_t gid = (gid_t) -1;
947                 const char *envp[] = {
948                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
949                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
950                         NULL, /* TERM */
951                         NULL, /* HOME */
952                         NULL, /* USER */
953                         NULL, /* LOGNAME */
954                         NULL, /* container_uuid */
955                         NULL
956                 };
957
958                 envp[2] = strv_find_prefix(environ, "TERM=");
959
960                 close_nointr_nofail(master);
961
962                 close_nointr(STDIN_FILENO);
963                 close_nointr(STDOUT_FILENO);
964                 close_nointr(STDERR_FILENO);
965
966                 close_all_fds(&kmsg_socket_pair[1], 1);
967
968                 reset_all_signal_handlers();
969
970                 assert_se(sigemptyset(&mask) == 0);
971                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
972
973                 if (setsid() < 0)
974                         goto child_fail;
975
976                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
977                         goto child_fail;
978
979                 /* Mark / as private, in case somebody marked it shared */
980                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
981                         goto child_fail;
982
983                 /* Turn directory into bind mount */
984                 if (mount(arg_directory, arg_directory, "bind", MS_BIND, NULL) < 0) {
985                         log_error("Failed to make bind mount.");
986                         goto child_fail;
987                 }
988
989                 if (arg_read_only)
990                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
991                                 log_error("Failed to make read-only.");
992                                 goto child_fail;
993                         }
994
995                 if (mount_all(arg_directory) < 0)
996                         goto child_fail;
997
998                 if (copy_devnodes(arg_directory) < 0)
999                         goto child_fail;
1000
1001                 if (setup_dev_console(arg_directory, console) < 0)
1002                         goto child_fail;
1003
1004                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1005                         goto child_fail;
1006
1007                 close_nointr_nofail(kmsg_socket_pair[1]);
1008
1009                 if (setup_timezone(arg_directory) < 0)
1010                         goto child_fail;
1011
1012                 if (setup_resolv_conf(arg_directory) < 0)
1013                         goto child_fail;
1014
1015                 if (chdir(arg_directory) < 0) {
1016                         log_error("chdir(%s) failed: %m", arg_directory);
1017                         goto child_fail;
1018                 }
1019
1020                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
1021                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1022                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1023                         goto child_fail;
1024
1025                 if (mount(arg_directory, "/", "bind", MS_MOVE, NULL) < 0) {
1026                         log_error("mount(MS_BIND) failed: %m");
1027                         goto child_fail;
1028                 }
1029
1030                 if (chroot(".") < 0) {
1031                         log_error("chroot() failed: %m");
1032                         goto child_fail;
1033                 }
1034
1035                 if (chdir("/") < 0) {
1036                         log_error("chdir() failed: %m");
1037                         goto child_fail;
1038                 }
1039
1040                 umask(0022);
1041
1042                 loopback_setup();
1043
1044                 if (drop_capabilities() < 0)
1045                         goto child_fail;
1046
1047                 if (arg_user) {
1048
1049                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
1050                                 log_error("get_user_creds() failed: %m");
1051                                 goto child_fail;
1052                         }
1053
1054                         if (mkdir_parents(home, 0775) < 0) {
1055                                 log_error("mkdir_parents() failed: %m");
1056                                 goto child_fail;
1057                         }
1058
1059                         if (safe_mkdir(home, 0775, uid, gid) < 0) {
1060                                 log_error("safe_mkdir() failed: %m");
1061                                 goto child_fail;
1062                         }
1063
1064                         if (initgroups((const char*)arg_user, gid) < 0) {
1065                                 log_error("initgroups() failed: %m");
1066                                 goto child_fail;
1067                         }
1068
1069                         if (setresgid(gid, gid, gid) < 0) {
1070                                 log_error("setregid() failed: %m");
1071                                 goto child_fail;
1072                         }
1073
1074                         if (setresuid(uid, uid, uid) < 0) {
1075                                 log_error("setreuid() failed: %m");
1076                                 goto child_fail;
1077                         }
1078                 }
1079
1080                 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1081                     (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1082                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1083                     log_error("Out of memory");
1084                     goto child_fail;
1085                 }
1086
1087                 if (arg_uuid) {
1088                         if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1089                                 log_error("Out of memory");
1090                                 goto child_fail;
1091                         }
1092                 }
1093
1094                 setup_hostname();
1095
1096                 if (arg_boot) {
1097                         char **a;
1098                         size_t l;
1099
1100                         /* Automatically search for the init system */
1101
1102                         l = 1 + argc - optind;
1103                         a = newa(char*, l + 1);
1104                         memcpy(a + 1, argv + optind, l * sizeof(char*));
1105
1106                         a[0] = (char*) "/usr/lib/systemd/systemd";
1107                         execve(a[0], a, (char**) envp);
1108
1109                         a[0] = (char*) "/lib/systemd/systemd";
1110                         execve(a[0], a, (char**) envp);
1111
1112                         a[0] = (char*) "/sbin/init";
1113                         execve(a[0], a, (char**) envp);
1114                 } else if (argc > optind)
1115                         execvpe(argv[optind], argv + optind, (char**) envp);
1116                 else {
1117                         chdir(home ? home : "/root");
1118                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1119                 }
1120
1121                 log_error("execv() failed: %m");
1122
1123         child_fail:
1124                 _exit(EXIT_FAILURE);
1125         }
1126
1127         if (process_pty(master, &mask) < 0)
1128                 goto finish;
1129
1130         if (saved_attr_valid) {
1131                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1132                 saved_attr_valid = false;
1133         }
1134
1135         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1136
1137         if (r < 0)
1138                 r = EXIT_FAILURE;
1139
1140 finish:
1141         if (saved_attr_valid)
1142                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1143
1144         if (master >= 0)
1145                 close_nointr_nofail(master);
1146
1147         close_pipe(kmsg_socket_pair);
1148
1149         if (oldcg)
1150                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1151
1152         if (newcg)
1153                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1154
1155         free(arg_directory);
1156         strv_free(arg_controllers);
1157         free(oldcg);
1158         free(newcg);
1159
1160         return r;
1161 }