chiark / gitweb /
nspawn: add -b switch to automatically look for an init binary
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "loopback-setup.h"
54
/* Settings filled in by parse_argv() from the command line. */

/* Plain switches: -b/--boot and --private-network */
static bool arg_boot = false;
static bool arg_private_network = false;

/* Options taking a value: -D/--directory=, -u/--user= (both heap
 * allocated strings) and -C/--controllers= (a heap allocated string
 * vector) */
static char *arg_directory = NULL;
static char *arg_user = NULL;
static char **arg_controllers = NULL;
60
/* Prints the usage summary to stdout, prefixed with the name this
 * program was invoked as. Always returns 0. */
static int help(void) {

        /* Everything that follows the program name in the first line */
        static const char usage[] =
                " [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
                "  -h --help             Show this help\n"
                "  -D --directory=NAME   Root directory for the container\n"
                "  -b --boot             Boot up full system (i.e. invoke init)\n"
                "  -u --user=USER        Run the command under specified user or uid\n"
                "  -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
                "     --private-network  Disable network in container\n";

        printf("%s%s", program_invocation_short_name, usage);

        return 0;
}
75
76 static int parse_argv(int argc, char *argv[]) {
77
78         enum {
79                 ARG_PRIVATE_NETWORK = 0x100
80         };
81
82         static const struct option options[] = {
83                 { "help",            no_argument,       NULL, 'h'                 },
84                 { "directory",       required_argument, NULL, 'D'                 },
85                 { "user",            required_argument, NULL, 'u'                 },
86                 { "controllers",     required_argument, NULL, 'C'                 },
87                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
88                 { "boot",            no_argument,       NULL, 'b'                 },
89                 { NULL,              0,                 NULL, 0                   }
90         };
91
92         int c;
93
94         assert(argc >= 0);
95         assert(argv);
96
97         while ((c = getopt_long(argc, argv, "+hD:u:C:b", options, NULL)) >= 0) {
98
99                 switch (c) {
100
101                 case 'h':
102                         help();
103                         return 0;
104
105                 case 'D':
106                         free(arg_directory);
107                         arg_directory = canonicalize_file_name(optarg);
108                         if (!arg_directory) {
109                                 log_error("Failed to canonicalize root directory.");
110                                 return -ENOMEM;
111                         }
112
113                         break;
114
115                 case 'u':
116                         free(arg_user);
117                         if (!(arg_user = strdup(optarg))) {
118                                 log_error("Failed to duplicate user name.");
119                                 return -ENOMEM;
120                         }
121
122                         break;
123
124                 case 'C':
125                         strv_free(arg_controllers);
126                         arg_controllers = strv_split(optarg, ",");
127                         if (!arg_controllers) {
128                                 log_error("Failed to split controllers list.");
129                                 return -ENOMEM;
130                         }
131                         strv_uniq(arg_controllers);
132
133                         break;
134
135                 case ARG_PRIVATE_NETWORK:
136                         arg_private_network = true;
137                         break;
138
139                 case 'b':
140                         arg_boot = true;
141                         break;
142
143                 case '?':
144                         return -EINVAL;
145
146                 default:
147                         log_error("Unknown option code %c", c);
148                         return -EINVAL;
149                 }
150         }
151
152         return 1;
153 }
154
/* Mounts the API file systems listed in the table below underneath the
 * directory dest.
 *
 * All entries are attempted even if an earlier one failed (only running
 * out of memory stops the loop early). Returns 0 on success, otherwise
 * the first error encountered as negative errno-style value. Failing
 * mounts of entries not marked fatal are ignored. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV,      true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND,                           true  },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                           true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,          true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                           true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        int first_error = 0;
        unsigned idx;

        for (idx = 0; idx < ELEMENTSOF(mount_table); idx++) {
                const MountPoint *m = mount_table + idx;
                char *target;
                int q;

                if (asprintf(&target, "%s/%s", dest, m->where) < 0) {
                        log_error("Out of memory");

                        if (first_error == 0)
                                first_error = -ENOMEM;

                        /* No point in trying the remaining entries */
                        break;
                }

                q = path_is_mount_point(target, false);
                if (q >= 0) {
                        /* Make sure there is something to mount on */
                        mkdir_p(target, 0755);

                        if (mount(m->what, target, m->type, m->flags, m->options) < 0 &&
                            m->fatal) {

                                log_error("mount(%s) failed: %m", target);

                                if (first_error == 0)
                                        first_error = -errno;
                        }
                } else {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-q));

                        if (first_error == 0)
                                first_error = q;
                }

                free(target);
        }

        return first_error;
}
228
/* Bind mounts source onto target and, only if that worked, remounts the
 * result read-only. Purely best-effort: failures are not reported. */
static void timezone_bind_ro(const char *source, const char *target) {

        if (mount(source, target, "bind", MS_BIND, NULL) < 0)
                return;

        mount(source, target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
}

/* Makes the host's /etc/localtime and /etc/timezone visible read-only
 * at the same paths below dest, if possible. Mount failures are
 * ignored; the only error returned is -ENOMEM. */
static int setup_timezone(const char *dest) {
        char *target;

        assert(dest);

        if (asprintf(&target, "%s/etc/localtime", dest) < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        timezone_bind_ro("/etc/localtime", target);
        free(target);

        if (asprintf(&target, "%s/etc/timezone", dest) < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        timezone_bind_ro("/etc/timezone", target);
        free(target);

        return 0;
}
257
258 static int copy_devnodes(const char *dest) {
259
260         static const char devnodes[] =
261                 "null\0"
262                 "zero\0"
263                 "full\0"
264                 "random\0"
265                 "urandom\0"
266                 "tty\0"
267                 "ptmx\0"
268                 "rtc0\0";
269
270         const char *d;
271         int r = 0;
272         mode_t u;
273
274         assert(dest);
275
276         u = umask(0000);
277
278         NULSTR_FOREACH(d, devnodes) {
279                 struct stat st;
280                 char *from = NULL, *to = NULL;
281
282                 asprintf(&from, "/dev/%s", d);
283                 asprintf(&to, "%s/dev/%s", dest, d);
284
285                 if (!from || !to) {
286                         log_error("Failed to allocate devnode path");
287
288                         free(from);
289                         free(to);
290
291                         from = to = NULL;
292
293                         if (r == 0)
294                                 r = -ENOMEM;
295
296                         break;
297                 }
298
299                 if (stat(from, &st) < 0) {
300
301                         if (errno != ENOENT) {
302                                 log_error("Failed to stat %s: %m", from);
303                                 if (r == 0)
304                                         r = -errno;
305                         }
306
307                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
308
309                         log_error("%s is not a char or block device, cannot copy.", from);
310                         if (r == 0)
311                                 r = -EIO;
312
313                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
314
315                         log_error("mknod(%s) failed: %m", dest);
316                         if (r == 0)
317                                 r = -errno;
318                 }
319
320                 free(from);
321                 free(to);
322         }
323
324         umask(u);
325
326         return r;
327 }
328
/* Makes the TTY named by console available as <dest>/dev/console.
 *
 * The TTY's access mode and ownership are reset to 0600/root:root, a
 * device node is created as mount point and the TTY is then bind
 * mounted on top of it. Returns a negative errno-style value on
 * failure, otherwise the non-negative result of chmod_and_chown(). */
static int setup_dev_console(const char *dest, const char *console) {
        char *node = NULL;
        struct stat tty_st;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        /* Have mknod() below apply the requested mode as is; the old
         * umask is put back on the way out */
        saved_umask = umask(0000);

        if (stat(console, &tty_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(tty_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        /* ptys can only exist on pts file systems, hence the right
         * tty has to be bind mounted to /dev/console. In order to
         * have something to mount on, a device node is created first.
         * Its major/minor does not really matter, as it is mounted
         * over anyway. */

        if (mknod(node, (tty_st.st_mode & ~07777) | 0600, tty_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
388
/* Provides a FIFO as /proc/kmsg replacement inside the tree at dest.
 *
 * The FIFO is created as <dest>/dev/kmsg, bind mounted to
 * <dest>/proc/kmsg, and an fd referring to it is queued on kmsg_socket
 * before the /dev/kmsg name is removed again. Returns a negative
 * errno-style value on failure, otherwise the non-negative result of
 * chmod_and_chown(). */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;
        char *fifo_path = NULL, *proc_path = NULL;
        mode_t saved_umask;
        int r, fifo_fd, sent;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* The kmsg FIFO is created as /dev/kmsg, but deleted again
         * right after it has been bind mounted to /proc/kmsg. The
         * reading side of a FIFO behaves very similar to /proc/kmsg,
         * but its writing side differs from /dev/kmsg: writing
         * blocks when nothing is reading. To keep containers from
         * deadlocking due to this, /dev/kmsg is simply made
         * unavailable to the container. */
        if (asprintf(&fifo_path, "%s/dev/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        if (asprintf(&proc_path, "%s/proc/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        if (mkfifo(fifo_path, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        r = chmod_and_chown(fifo_path, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto finish;
        }

        if (mount(fifo_path, proc_path, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        fifo_fd = open(fifo_path, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                r = -errno;
                goto finish;
        }

        /* Build a message that carries nothing but the fd as
         * SCM_RIGHTS ancillary data */
        zero(mh);
        zero(control);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fifo_fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Park the fd in the socket, so that it stays open for as
         * long as the child runs. Our own copy is not needed
         * anymore, whether sending worked or not. */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                r = -errno;
                goto finish;
        }

        /* Finally, make the FIFO unavailable as /dev/kmsg */
        unlink(fifo_path);

finish:
        free(fifo_path);
        free(proc_path);
        umask(saved_umask);

        return r;
}
484
485 static int setup_hostname(void) {
486         char *hn;
487         int r = 0;
488
489         hn = file_name_from_path(arg_directory);
490         if (hn) {
491                 hn = strdup(hn);
492                 if (!hn)
493                         return -ENOMEM;
494
495                 hostname_cleanup(hn);
496
497                 if (!isempty(hn))
498                         if (sethostname(hn, strlen(hn)) < 0)
499                                 r = -errno;
500
501                 free(hn);
502         }
503
504         return r;
505 }
506
/* Removes every capability from the bounding set, except for the ones
 * listed in retain[]. Returns 0 on success, or the negative errno of
 * the first failing prctl() call, at which point processing stops. */
static int drop_capabilities(void) {

        /* The capabilities that are left in the bounding set */
        static const unsigned long retain[] = {
                CAP_CHOWN,
                CAP_DAC_OVERRIDE,
                CAP_DAC_READ_SEARCH,
                CAP_FOWNER,
                CAP_FSETID,
                CAP_IPC_OWNER,
                CAP_KILL,
                CAP_LEASE,
                CAP_LINUX_IMMUTABLE,
                CAP_NET_BIND_SERVICE,
                CAP_NET_BROADCAST,
                CAP_NET_RAW,
                CAP_SETGID,
                CAP_SETFCAP,
                CAP_SETPCAP,
                CAP_SETUID,
                CAP_SYS_ADMIN,
                CAP_SYS_CHROOT,
                CAP_SYS_NICE,
                CAP_SYS_PTRACE,
                CAP_SYS_TTY_CONFIG
        };

        unsigned long cap;

        for (cap = 0; cap <= cap_last_cap(); cap++) {
                bool keep = false;
                unsigned j;

                /* Is this one on the list? */
                for (j = 0; j < ELEMENTSOF(retain) && !keep; j++)
                        keep = retain[j] == cap;

                if (keep)
                        continue;

                if (prctl(PR_CAPBSET_DROP, cap) < 0) {
                        log_error("PR_CAPBSET_DROP failed: %m");
                        return -errno;
                }
        }

        return 0;
}
552
/* Checks whether path looks like the root directory of an OS tree,
 * using the existence of <path>/bin/sh as indicator.
 *
 * Returns 1 if the file exists, 0 if access() fails for whatever
 * reason, and -ENOMEM if the path to check could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found ? 1 : 0;
}
566
/* Shovels data between our stdin/stdout and the pty master until a
 * signal tells us to stop.
 *
 * Bytes read from stdin are written to master, bytes read from master
 * are written to stdout. The signals in mask are received through a
 * signalfd: SIGWINCH causes the window size of stdin to be copied to
 * master, any other signal ends the loop.
 *
 * Returns 0 if the loop was ended by a signal, a negative errno-style
 * value on failure. */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
                log_error("Failed to create epoll: %m");
                r = -errno;
                goto finish;
        }

        /* The data fds are watched edge-triggered: the *_readable and
         * *_writable flags below remember the readiness until the
         * respective read()/write() reports that it is gone. The
         * signalfd is watched level-triggered. */
        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                log_error("Failed to regiser fds in epoll: %m");
                r = -errno;
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        log_error("epoll_wait(): %m");
                        r = -errno;
                        goto finish;
                }

                assert(nfds >= 1);

                /* First, record what the events tell us... */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                log_error("Failed to read from signalfd: %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                /* Any other signal in the mask ends the loop */
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* ... then move data around for as long as progress is
                 * possible in any direction */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdin_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* Nothing to read (EOF). Without
                                         * clearing the flag this loop
                                         * would spin forever on an empty
                                         * buffer. A later epoll event
                                         * sets it again. */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                if ((k = write(master, in_buffer, in_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written and move
                                         * the rest to the front */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* See above: avoid busy looping
                                         * on zero-length reads */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdout_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
769
770 int main(int argc, char *argv[]) {
771         pid_t pid = 0;
772         int r = EXIT_FAILURE, k;
773         char *oldcg = NULL, *newcg = NULL;
774         char **controller = NULL;
775         int master = -1;
776         const char *console = NULL;
777         struct termios saved_attr, raw_attr;
778         sigset_t mask;
779         bool saved_attr_valid = false;
780         struct winsize ws;
781         int kmsg_socket_pair[2] = { -1, -1 };
782
783         log_parse_environment();
784         log_open();
785
786         if ((r = parse_argv(argc, argv)) <= 0)
787                 goto finish;
788
789         if (arg_directory) {
790                 char *p;
791
792                 p = path_make_absolute_cwd(arg_directory);
793                 free(arg_directory);
794                 arg_directory = p;
795         } else
796                 arg_directory = get_current_dir_name();
797
798         if (!arg_directory) {
799                 log_error("Failed to determine path");
800                 goto finish;
801         }
802
803         path_kill_slashes(arg_directory);
804
805         if (geteuid() != 0) {
806                 log_error("Need to be root.");
807                 goto finish;
808         }
809
810         if (sd_booted() <= 0) {
811                 log_error("Not running on a systemd system.");
812                 goto finish;
813         }
814
815         if (path_equal(arg_directory, "/")) {
816                 log_error("Spawning container on root directory not supported.");
817                 goto finish;
818         }
819
820         if (is_os_tree(arg_directory) <= 0) {
821                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
822                 goto finish;
823         }
824
825         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
826                 log_error("Failed to determine current cgroup: %s", strerror(-k));
827                 goto finish;
828         }
829
830         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
831                 log_error("Failed to allocate cgroup path.");
832                 goto finish;
833         }
834
835         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
836         if (k < 0)  {
837                 log_error("Failed to create cgroup: %s", strerror(-k));
838                 goto finish;
839         }
840
841         STRV_FOREACH(controller,arg_controllers) {
842                 k = cg_create_and_attach(*controller, newcg, 0);
843                 if (k < 0)
844                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
845         }
846
847         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
848                 log_error("Failed to acquire pseudo tty: %m");
849                 goto finish;
850         }
851
852         if (!(console = ptsname(master))) {
853                 log_error("Failed to determine tty name: %m");
854                 goto finish;
855         }
856
857         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
858
859         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
860                 ioctl(master, TIOCSWINSZ, &ws);
861
862         if (unlockpt(master) < 0) {
863                 log_error("Failed to unlock tty: %m");
864                 goto finish;
865         }
866
867         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
868                 log_error("Failed to get terminal attributes: %m");
869                 goto finish;
870         }
871
872         saved_attr_valid = true;
873
874         raw_attr = saved_attr;
875         cfmakeraw(&raw_attr);
876         raw_attr.c_lflag &= ~ECHO;
877
878         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
879                 log_error("Failed to set terminal attributes: %m");
880                 goto finish;
881         }
882
883         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
884                 log_error("Failed to create kmsg socket pair");
885                 goto finish;
886         }
887
888         assert_se(sigemptyset(&mask) == 0);
889         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
890         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
891
892         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
893         if (pid < 0) {
894                 if (errno == EINVAL)
895                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
896                 else
897                         log_error("clone() failed: %m");
898
899                 goto finish;
900         }
901
902         if (pid == 0) {
903                 /* child */
904
905                 const char *home = NULL;
906                 uid_t uid = (uid_t) -1;
907                 gid_t gid = (gid_t) -1;
908                 const char *envp[] = {
909                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
910                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
911                         NULL, /* TERM */
912                         NULL, /* HOME */
913                         NULL, /* USER */
914                         NULL, /* LOGNAME */
915                         NULL
916                 };
917
918                 envp[2] = strv_find_prefix(environ, "TERM=");
919
920                 close_nointr_nofail(master);
921
922                 close_nointr(STDIN_FILENO);
923                 close_nointr(STDOUT_FILENO);
924                 close_nointr(STDERR_FILENO);
925
926                 close_all_fds(&kmsg_socket_pair[1], 1);
927
928                 reset_all_signal_handlers();
929
930                 assert_se(sigemptyset(&mask) == 0);
931                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
932
933                 if (setsid() < 0)
934                         goto child_fail;
935
936                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
937                         goto child_fail;
938
939                 /* Mark / as private, in case somebody marked it shared */
940                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
941                         goto child_fail;
942
943                 if (mount_all(arg_directory) < 0)
944                         goto child_fail;
945
946                 if (copy_devnodes(arg_directory) < 0)
947                         goto child_fail;
948
949                 if (setup_dev_console(arg_directory, console) < 0)
950                         goto child_fail;
951
952                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
953                         goto child_fail;
954
955                 close_nointr_nofail(kmsg_socket_pair[1]);
956
957                 if (setup_timezone(arg_directory) < 0)
958                         goto child_fail;
959
960                 if (chdir(arg_directory) < 0) {
961                         log_error("chdir(%s) failed: %m", arg_directory);
962                         goto child_fail;
963                 }
964
965                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
966                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
967                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
968                         goto child_fail;
969
970                 if (mount(arg_directory, "/", "bind", MS_BIND, NULL) < 0) {
971                         log_error("mount(MS_MOVE) failed: %m");
972                         goto child_fail;
973                 }
974
975                 if (chroot(".") < 0) {
976                         log_error("chroot() failed: %m");
977                         goto child_fail;
978                 }
979
980                 if (chdir("/") < 0) {
981                         log_error("chdir() failed: %m");
982                         goto child_fail;
983                 }
984
985                 umask(0022);
986
987                 loopback_setup();
988
989                 if (drop_capabilities() < 0)
990                         goto child_fail;
991
992                 if (arg_user) {
993
994                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
995                                 log_error("get_user_creds() failed: %m");
996                                 goto child_fail;
997                         }
998
999                         if (mkdir_parents(home, 0775) < 0) {
1000                                 log_error("mkdir_parents() failed: %m");
1001                                 goto child_fail;
1002                         }
1003
1004                         if (safe_mkdir(home, 0775, uid, gid) < 0) {
1005                                 log_error("safe_mkdir() failed: %m");
1006                                 goto child_fail;
1007                         }
1008
1009                         if (initgroups((const char*)arg_user, gid) < 0) {
1010                                 log_error("initgroups() failed: %m");
1011                                 goto child_fail;
1012                         }
1013
1014                         if (setresgid(gid, gid, gid) < 0) {
1015                                 log_error("setregid() failed: %m");
1016                                 goto child_fail;
1017                         }
1018
1019                         if (setresuid(uid, uid, uid) < 0) {
1020                                 log_error("setreuid() failed: %m");
1021                                 goto child_fail;
1022                         }
1023                 }
1024
1025                 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
1026                     (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
1027                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
1028                     log_error("Out of memory");
1029                     goto child_fail;
1030                 }
1031
1032                 setup_hostname();
1033
1034                 if (arg_boot) {
1035                         char **a;
1036                         size_t l;
1037
1038                         /* Automatically search for the init system */
1039
1040                         l = 1 + argc - optind;
1041                         a = newa(char*, l + 1);
1042                         memcpy(a + 1, argv + optind, l * sizeof(char*));
1043
1044                         a[0] = (char*) "/usr/lib/systemd/systemd";
1045                         execve(a[0], a, (char**) envp);
1046
1047                         a[0] = (char*) "/lib/systemd/systemd";
1048                         execve(a[0], a, (char**) envp);
1049
1050                         a[0] = (char*) "/sbin/init";
1051                         execve(a[0], a, (char**) envp);
1052                 } else if (argc > optind)
1053                         execvpe(argv[optind], argv + optind, (char**) envp);
1054                 else {
1055                         chdir(home ? home : "/root");
1056                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1057                 }
1058
1059                 log_error("execv() failed: %m");
1060
1061         child_fail:
1062                 _exit(EXIT_FAILURE);
1063         }
1064
1065         if (process_pty(master, &mask) < 0)
1066                 goto finish;
1067
1068         if (saved_attr_valid) {
1069                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1070                 saved_attr_valid = false;
1071         }
1072
1073         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1074
1075         if (r < 0)
1076                 r = EXIT_FAILURE;
1077
1078 finish:
1079         if (saved_attr_valid)
1080                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1081
1082         if (master >= 0)
1083                 close_nointr_nofail(master);
1084
1085         close_pipe(kmsg_socket_pair);
1086
1087         if (oldcg)
1088                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1089
1090         if (newcg)
1091                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1092
1093         free(arg_directory);
1094         strv_free(arg_controllers);
1095         free(oldcg);
1096         free(newcg);
1097
1098         return r;
1099 }