chiark / gitweb /
nspawn: add --read-only switch
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "loopback-setup.h"
54
/* Command line state, filled in by parse_argv(). */

/* String/list valued options. */
static char *arg_directory = NULL;      /* -D/--directory: container root, canonicalized; heap-allocated */
static char *arg_user = NULL;           /* -u/--user: user name or uid to run as; heap-allocated */
static char *arg_uuid = NULL;           /* --uuid: points straight at optarg, not a private copy */
static char **arg_controllers = NULL;   /* -C/--controllers: de-duplicated list split on ',' */

/* Boolean switches. */
static bool arg_boot = false;            /* -b/--boot */
static bool arg_private_network = false; /* --private-network */
static bool arg_read_only = false;       /* --read-only */
62
/* Writes the usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Only the first line needs formatting; the option table is static text. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n",
               program_invocation_short_name);

        fputs("Spawn a minimal namespace container for debugging, testing and building.\n\n"
              "  -h --help             Show this help\n"
              "  -D --directory=NAME   Root directory for the container\n"
              "  -b --boot             Boot up full system (i.e. invoke init)\n"
              "  -u --user=USER        Run the command under specified user or uid\n"
              "  -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
              "     --uuid=UUID        Set a specific machine UUID for the container\n"
              "     --private-network  Disable network in container\n"
              "     --read-only        Mount the root directory read-only\n",
              stdout);

        return 0;
}
79
80 static int parse_argv(int argc, char *argv[]) {
81
82         enum {
83                 ARG_PRIVATE_NETWORK = 0x100,
84                 ARG_UUID,
85                 ARG_READ_ONLY
86         };
87
88         static const struct option options[] = {
89                 { "help",            no_argument,       NULL, 'h'                 },
90                 { "directory",       required_argument, NULL, 'D'                 },
91                 { "user",            required_argument, NULL, 'u'                 },
92                 { "controllers",     required_argument, NULL, 'C'                 },
93                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
94                 { "boot",            no_argument,       NULL, 'b'                 },
95                 { "uuid",            required_argument, NULL, ARG_UUID            },
96                 { "read-only",       no_argument,       NULL, ARG_READ_ONLY       },
97                 { NULL,              0,                 NULL, 0                   }
98         };
99
100         int c;
101
102         assert(argc >= 0);
103         assert(argv);
104
105         while ((c = getopt_long(argc, argv, "+hD:u:C:b", options, NULL)) >= 0) {
106
107                 switch (c) {
108
109                 case 'h':
110                         help();
111                         return 0;
112
113                 case 'D':
114                         free(arg_directory);
115                         arg_directory = canonicalize_file_name(optarg);
116                         if (!arg_directory) {
117                                 log_error("Failed to canonicalize root directory.");
118                                 return -ENOMEM;
119                         }
120
121                         break;
122
123                 case 'u':
124                         free(arg_user);
125                         if (!(arg_user = strdup(optarg))) {
126                                 log_error("Failed to duplicate user name.");
127                                 return -ENOMEM;
128                         }
129
130                         break;
131
132                 case 'C':
133                         strv_free(arg_controllers);
134                         arg_controllers = strv_split(optarg, ",");
135                         if (!arg_controllers) {
136                                 log_error("Failed to split controllers list.");
137                                 return -ENOMEM;
138                         }
139                         strv_uniq(arg_controllers);
140
141                         break;
142
143                 case ARG_PRIVATE_NETWORK:
144                         arg_private_network = true;
145                         break;
146
147                 case 'b':
148                         arg_boot = true;
149                         break;
150
151                 case ARG_UUID:
152                         arg_uuid = optarg;
153                         break;
154
155                 case ARG_READ_ONLY:
156                         arg_read_only = true;
157                         break;
158
159                 case '?':
160                         return -EINVAL;
161
162                 default:
163                         log_error("Unknown option code %c", c);
164                         return -EINVAL;
165                 }
166         }
167
168         return 1;
169 }
170
/* Mounts the API file systems listed in the table below underneath dest.
 *
 * All entries are attempted even if an earlier one fails (except on memory
 * exhaustion, which stops the loop). Returns 0 on success, otherwise the
 * negative errno-style code of the first failure encountered. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;       /* source passed to mount() */
                const char *where;      /* target path, relative to dest */
                const char *type;       /* file system type passed to mount() */
                const char *options;    /* mount data string, or NULL */
                unsigned long flags;    /* MS_* flags */
                bool fatal;             /* if false, a mount() failure is silently ignored */
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND, true                       },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        unsigned k;
        int r = 0;

        assert(dest);

        for (k = 0; k < ELEMENTSOF(mount_table); k++) {
                char *where;
                int t;

                if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
                        log_error("Out of memory");

                        if (r == 0)
                                r = -ENOMEM;

                        break;
                }

                t = path_is_mount_point(where, false);
                if (t < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
                        free(where);

                        if (r == 0)
                                r = t;

                        continue;
                }

                /* Best effort: if this fails the mount() below will fail too. */
                mkdir_p(where, 0755);

                if (mount(mount_table[k].what,
                          where,
                          mount_table[k].type,
                          mount_table[k].flags,
                          mount_table[k].options) < 0 &&
                    mount_table[k].fatal) {

                        /* Remember errno first; logging may overwrite it. */
                        int saved_errno = errno;

                        log_error("mount(%s) failed: %m", where);

                        if (r == 0)
                                r = -saved_errno;
                }

                free(where);
        }

        return r;
}
244
/* Bind mounts the host's time zone files read-only into the tree at dest.
 *
 * Mount failures are ignored; only running out of memory is reported, as
 * -ENOMEM. Returns 0 otherwise. */
static int setup_timezone(const char *dest) {

        static const char * const tz_files[] = {
                "/etc/localtime",
                "/etc/timezone"
        };

        unsigned i;

        assert(dest);

        /* Fix the timezone, if possible */
        for (i = 0; i < ELEMENTSOF(tz_files); i++) {
                const char *source = tz_files[i];
                char *target;

                if (asprintf(&target, "%s%s", dest, source) < 0) {
                        log_error("Out of memory");
                        return -ENOMEM;
                }

                /* Bind first, then remount the new bind mount read-only. */
                if (mount(source, target, "bind", MS_BIND, NULL) >= 0)
                        mount(source, target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return 0;
}
273
274 static int setup_resolv_conf(const char *dest) {
275         char *where;
276
277         assert(dest);
278
279         if (arg_private_network)
280                 return 0;
281
282         /* Fix resolv.conf, if possible */
283         if (asprintf(&where, "%s/etc/resolv.conf", dest) < 0) {
284                 log_error("Out of memory");
285                 return -ENOMEM;
286         }
287
288         if (mount("/etc/resolv.conf", where, "bind", MS_BIND, NULL) >= 0)
289                 mount("/etc/resolv.conf", where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
290
291         free(where);
292
293         return 0;
294 }
295
296 static int copy_devnodes(const char *dest) {
297
298         static const char devnodes[] =
299                 "null\0"
300                 "zero\0"
301                 "full\0"
302                 "random\0"
303                 "urandom\0"
304                 "tty\0"
305                 "ptmx\0"
306                 "rtc0\0";
307
308         const char *d;
309         int r = 0;
310         mode_t u;
311
312         assert(dest);
313
314         u = umask(0000);
315
316         NULSTR_FOREACH(d, devnodes) {
317                 struct stat st;
318                 char *from = NULL, *to = NULL;
319
320                 asprintf(&from, "/dev/%s", d);
321                 asprintf(&to, "%s/dev/%s", dest, d);
322
323                 if (!from || !to) {
324                         log_error("Failed to allocate devnode path");
325
326                         free(from);
327                         free(to);
328
329                         from = to = NULL;
330
331                         if (r == 0)
332                                 r = -ENOMEM;
333
334                         break;
335                 }
336
337                 if (stat(from, &st) < 0) {
338
339                         if (errno != ENOENT) {
340                                 log_error("Failed to stat %s: %m", from);
341                                 if (r == 0)
342                                         r = -errno;
343                         }
344
345                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
346
347                         log_error("%s is not a char or block device, cannot copy.", from);
348                         if (r == 0)
349                                 r = -EIO;
350
351                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
352
353                         log_error("mknod(%s) failed: %m", dest);
354                         if (r == 0)
355                                 r = -errno;
356                 }
357
358                 free(from);
359                 free(to);
360         }
361
362         umask(u);
363
364         return r;
365 }
366
/* Makes the tty named by console available as dest/dev/console.
 *
 * The tty's access mode and ownership are reset first, then a placeholder
 * device node is created at dest/dev/console and the tty is bind mounted
 * over it. Returns 0 on success, a negative errno-style value on failure. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat st;
        char *to = NULL;
        int r, saved_errno;
        mode_t u;

        assert(dest);
        assert(console);

        /* Clear the umask for the mknod() below; restored at finish. */
        u = umask(0000);

        if (stat(console, &st) < 0) {
                /* Remember errno first; logging may overwrite it. */
                saved_errno = errno;
                log_error("Failed to stat %s: %m", console);
                r = -saved_errno;
                goto finish;

        } else if (!S_ISCHR(st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&to, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                to = NULL; /* unspecified after a failed asprintf(); keep free() safe */
                r = -ENOMEM;
                goto finish;
        }

        /* We need to bind mount the right tty to /dev/console since
         * ptys can only exist on pts file systems. To have something
         * to bind mount things on we create a device node first, that
         * has the right major/minor (note that the major minor
         * doesn't actually matter here, since we mount it over
         * anyway). */

        if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
                saved_errno = errno;
                log_error("mknod() for /dev/console failed: %m");
                r = -saved_errno;
                goto finish;
        }

        if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
                saved_errno = errno;
                log_error("Bind mount for /dev/console failed: %m");
                r = -saved_errno;
                goto finish;
        }

        /* Make the success value explicit instead of relying on whatever
         * non-negative value chmod_and_chown() left in r. */
        r = 0;

finish:
        free(to);
        umask(u);

        return r;
}
426
/* Provides a FIFO-backed /proc/kmsg inside the container tree at dest.
 *
 * A FIFO is created at dest/dev/kmsg, bind mounted onto dest/proc/kmsg,
 * opened, and the open fd is queued on kmsg_socket via SCM_RIGHTS so that
 * it stays referenced. Finally the dest/dev/kmsg name is unlinked again.
 * Returns 0 on success, a negative errno-style value on failure. */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *from = NULL, *to = NULL;
        int r, fd, k, saved_errno;
        mode_t u;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;

        assert(dest);
        assert(kmsg_socket >= 0);

        u = umask(0000);

        /* We create the kmsg FIFO as /dev/kmsg, but immediately
         * delete it after bind mounting it to /proc/kmsg. While FIFOs
         * on the reading side behave very similar to /proc/kmsg,
         * their writing side behaves differently from /dev/kmsg in
         * that writing blocks when nothing is reading. In order to
         * avoid any problems with containers deadlocking due to this
         * we simply make /dev/kmsg unavailable to the container. */
        if (asprintf(&from, "%s/dev/kmsg", dest) < 0) {
                log_error("Out of memory");
                from = NULL; /* unspecified after a failed asprintf(); keep free() safe */
                r = -ENOMEM;
                goto finish;
        }

        if (asprintf(&to, "%s/proc/kmsg", dest) < 0) {
                log_error("Out of memory");
                to = NULL;
                r = -ENOMEM;
                goto finish;
        }

        if (mkfifo(from, 0600) < 0) {
                /* Remember errno first; logging may overwrite it. */
                saved_errno = errno;
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -saved_errno;
                goto finish;
        }

        r = chmod_and_chown(from, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto finish;
        }

        if (mount(from, to, "bind", MS_BIND, NULL) < 0) {
                saved_errno = errno;
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -saved_errno;
                goto finish;
        }

        fd = open(from, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fd < 0) {
                saved_errno = errno;
                log_error("Failed to open fifo: %m");
                r = -saved_errno;
                goto finish;
        }

        zero(mh);
        zero(control);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Store away the fd in the socket, so that it stays open as
         * long as we run the child */
        k = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        saved_errno = errno; /* the close below must not disturb sendmsg()'s errno */
        close_nointr_nofail(fd);

        if (k < 0) {
                errno = saved_errno; /* so that %m describes the sendmsg() failure */
                log_error("Failed to send FIFO fd: %m");
                r = -saved_errno;
                goto finish;
        }

        /* And now make the FIFO unavailable as /dev/kmsg... */
        unlink(from);

        r = 0;

finish:
        free(from);
        free(to);
        umask(u);

        return r;
}
522
523 static int setup_hostname(void) {
524         char *hn;
525         int r = 0;
526
527         hn = file_name_from_path(arg_directory);
528         if (hn) {
529                 hn = strdup(hn);
530                 if (!hn)
531                         return -ENOMEM;
532
533                 hostname_cleanup(hn);
534
535                 if (!isempty(hn))
536                         if (sethostname(hn, strlen(hn)) < 0)
537                                 r = -errno;
538
539                 free(hn);
540         }
541
542         return r;
543 }
544
545 static int drop_capabilities(void) {
546         static const unsigned long retain[] = {
547                 CAP_CHOWN,
548                 CAP_DAC_OVERRIDE,
549                 CAP_DAC_READ_SEARCH,
550                 CAP_FOWNER,
551                 CAP_FSETID,
552                 CAP_IPC_OWNER,
553                 CAP_KILL,
554                 CAP_LEASE,
555                 CAP_LINUX_IMMUTABLE,
556                 CAP_NET_BIND_SERVICE,
557                 CAP_NET_BROADCAST,
558                 CAP_NET_RAW,
559                 CAP_SETGID,
560                 CAP_SETFCAP,
561                 CAP_SETPCAP,
562                 CAP_SETUID,
563                 CAP_SYS_ADMIN,
564                 CAP_SYS_CHROOT,
565                 CAP_SYS_NICE,
566                 CAP_SYS_PTRACE,
567                 CAP_SYS_TTY_CONFIG
568         };
569
570         unsigned long l;
571
572         for (l = 0; l <= cap_last_cap(); l++) {
573                 unsigned i;
574
575                 for (i = 0; i < ELEMENTSOF(retain); i++)
576                         if (retain[i] == l)
577                                 break;
578
579                 if (i < ELEMENTSOF(retain))
580                         continue;
581
582                 if (prctl(PR_CAPBSET_DROP, l) < 0) {
583                         log_error("PR_CAPBSET_DROP failed: %m");
584                         return -errno;
585                 }
586         }
587
588         return 0;
589 }
590
/* Tells whether path looks like the root of an OS installation, using the
 * existence of PATH/bin/sh as the indicator.
 *
 * Returns 1 if the file exists, 0 if the check fails for any reason, and
 * -ENOMEM if the probe path could not be allocated. */
static int is_os_tree(const char *path) {
        char *probe;
        int found;

        /* We use /bin/sh as flag file if something is an OS */
        if (asprintf(&probe, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(probe, F_OK) >= 0;
        free(probe);

        return found ? 1 : 0;
}
604
/* True for the read()/write() errors the forwarding loop treats as "this
 * direction cannot make progress right now" instead of as a failure. */
static bool pty_io_errno_is_soft(int e) {
        return e == EAGAIN || e == EPIPE || e == ECONNRESET || e == EIO;
}

/* Forwards data between our stdin/stdout and the pty master.
 *
 * stdin is copied to master and master to stdout through two LINE_MAX
 * sized buffers, driven by edge-triggered epoll. Signals in mask are
 * received through a signalfd: SIGWINCH propagates the window size of
 * stdin to master, any other signal ends the loop.
 *
 * Returns 0 when ended by a signal, a negative errno-style value on
 * failure. */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r, saved_errno;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
        if (signal_fd < 0) {
                /* Remember errno first; logging may overwrite it. */
                saved_errno = errno;
                log_error("signalfd(): %m");
                r = -saved_errno;
                goto finish;
        }

        ep = epoll_create1(EPOLL_CLOEXEC);
        if (ep < 0) {
                saved_errno = errno;
                log_error("Failed to create epoll: %m");
                r = -saved_errno;
                goto finish;
        }

        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        /* The signalfd is level-triggered, unlike the data fds. */
        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                saved_errno = errno;
                log_error("Failed to regiser fds in epoll: %m");
                r = -saved_errno;
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
                if (nfds < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        saved_errno = errno;
                        log_error("epoll_wait(): %m");
                        r = -saved_errno;
                        goto finish;
                }

                assert(nfds >= 1);

                /* Phase 1: record which directions became ready and handle signals. */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                n = read(signal_fd, &sfsi, sizeof(sfsi));
                                if (n != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                saved_errno = errno;
                                                log_error("Failed to read from signalfd: %m");
                                                r = -saved_errno;
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* Phase 2: shovel data for as long as any direction can make
                 * progress. Because the data fds are edge-triggered, each one
                 * has to be drained/filled until it reports a soft error. */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
                                if (k < 0) {

                                        if (pty_io_errno_is_soft(errno))
                                                stdin_readable = false;
                                        else {
                                                saved_errno = errno;
                                                log_error("read(): %m");
                                                r = -saved_errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* End of file. Without clearing the flag the
                                         * loop condition stays true with an empty
                                         * buffer and we would spin here forever. */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                k = write(master, in_buffer, in_buffer_full);
                                if (k < 0) {

                                        if (pty_io_errno_is_soft(errno))
                                                master_writable = false;
                                        else {
                                                saved_errno = errno;
                                                log_error("write(): %m");
                                                r = -saved_errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the rest at the front. */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
                                if (k < 0) {

                                        if (pty_io_errno_is_soft(errno))
                                                master_readable = false;
                                        else {
                                                saved_errno = errno;
                                                log_error("read(): %m");
                                                r = -saved_errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* End of file, see the stdin case above. */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
                                if (k < 0) {

                                        if (pty_io_errno_is_soft(errno))
                                                stdout_writable = false;
                                        else {
                                                saved_errno = errno;
                                                log_error("write(): %m");
                                                r = -saved_errno;
                                                goto finish;
                                        }

                                } else {
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
807
808 int main(int argc, char *argv[]) {
809         pid_t pid = 0;
810         int r = EXIT_FAILURE, k;
811         char *oldcg = NULL, *newcg = NULL;
812         char **controller = NULL;
813         int master = -1;
814         const char *console = NULL;
815         struct termios saved_attr, raw_attr;
816         sigset_t mask;
817         bool saved_attr_valid = false;
818         struct winsize ws;
819         int kmsg_socket_pair[2] = { -1, -1 };
820
821         log_parse_environment();
822         log_open();
823
824         if ((r = parse_argv(argc, argv)) <= 0)
825                 goto finish;
826
827         if (arg_directory) {
828                 char *p;
829
830                 p = path_make_absolute_cwd(arg_directory);
831                 free(arg_directory);
832                 arg_directory = p;
833         } else
834                 arg_directory = get_current_dir_name();
835
836         if (!arg_directory) {
837                 log_error("Failed to determine path");
838                 goto finish;
839         }
840
841         path_kill_slashes(arg_directory);
842
843         if (geteuid() != 0) {
844                 log_error("Need to be root.");
845                 goto finish;
846         }
847
848         if (sd_booted() <= 0) {
849                 log_error("Not running on a systemd system.");
850                 goto finish;
851         }
852
853         if (path_equal(arg_directory, "/")) {
854                 log_error("Spawning container on root directory not supported.");
855                 goto finish;
856         }
857
858         if (is_os_tree(arg_directory) <= 0) {
859                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
860                 goto finish;
861         }
862
863         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
864                 log_error("Failed to determine current cgroup: %s", strerror(-k));
865                 goto finish;
866         }
867
868         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
869                 log_error("Failed to allocate cgroup path.");
870                 goto finish;
871         }
872
873         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
874         if (k < 0)  {
875                 log_error("Failed to create cgroup: %s", strerror(-k));
876                 goto finish;
877         }
878
879         STRV_FOREACH(controller,arg_controllers) {
880                 k = cg_create_and_attach(*controller, newcg, 0);
881                 if (k < 0)
882                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
883         }
884
885         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
886                 log_error("Failed to acquire pseudo tty: %m");
887                 goto finish;
888         }
889
890         if (!(console = ptsname(master))) {
891                 log_error("Failed to determine tty name: %m");
892                 goto finish;
893         }
894
895         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
896
897         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
898                 ioctl(master, TIOCSWINSZ, &ws);
899
900         if (unlockpt(master) < 0) {
901                 log_error("Failed to unlock tty: %m");
902                 goto finish;
903         }
904
905         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
906                 log_error("Failed to get terminal attributes: %m");
907                 goto finish;
908         }
909
910         saved_attr_valid = true;
911
912         raw_attr = saved_attr;
913         cfmakeraw(&raw_attr);
914         raw_attr.c_lflag &= ~ECHO;
915
916         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
917                 log_error("Failed to set terminal attributes: %m");
918                 goto finish;
919         }
920
921         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
922                 log_error("Failed to create kmsg socket pair");
923                 goto finish;
924         }
925
926         assert_se(sigemptyset(&mask) == 0);
927         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
928         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
929
930         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
931         if (pid < 0) {
932                 if (errno == EINVAL)
933                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
934                 else
935                         log_error("clone() failed: %m");
936
937                 goto finish;
938         }
939
940         if (pid == 0) {
941                 /* child */
942
943                 const char *home = NULL;
944                 uid_t uid = (uid_t) -1;
945                 gid_t gid = (gid_t) -1;
946                 const char *envp[] = {
947                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
948                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
949                         NULL, /* TERM */
950                         NULL, /* HOME */
951                         NULL, /* USER */
952                         NULL, /* LOGNAME */
953                         NULL, /* container_uuid */
954                         NULL
955                 };
956
957                 envp[2] = strv_find_prefix(environ, "TERM=");
958
959                 close_nointr_nofail(master);
960
961                 close_nointr(STDIN_FILENO);
962                 close_nointr(STDOUT_FILENO);
963                 close_nointr(STDERR_FILENO);
964
965                 close_all_fds(&kmsg_socket_pair[1], 1);
966
967                 reset_all_signal_handlers();
968
969                 assert_se(sigemptyset(&mask) == 0);
970                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
971
972                 if (setsid() < 0)
973                         goto child_fail;
974
975                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
976                         goto child_fail;
977
978                 /* Mark / as private, in case somebody marked it shared */
979                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
980                         goto child_fail;
981
982                 /* Turn directory into bind mount */
983                 if (mount(arg_directory, arg_directory, "bind", MS_BIND, NULL) < 0) {
984                         log_error("Failed to make bind mount.");
985                         goto child_fail;
986                 }
987
988                 if (arg_read_only)
989                         if (mount(arg_directory, arg_directory, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL) < 0) {
990                                 log_error("Failed to make read-only.");
991                                 goto child_fail;
992                         }
993
994                 if (mount_all(arg_directory) < 0)
995                         goto child_fail;
996
997                 if (copy_devnodes(arg_directory) < 0)
998                         goto child_fail;
999
1000                 if (setup_dev_console(arg_directory, console) < 0)
1001                         goto child_fail;
1002
1003                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
1004                         goto child_fail;
1005
1006                 close_nointr_nofail(kmsg_socket_pair[1]);
1007
1008                 if (setup_timezone(arg_directory) < 0)
1009                         goto child_fail;
1010
1011                 if (setup_resolv_conf(arg_directory) < 0)
1012                         goto child_fail;
1013
1014                 if (chdir(arg_directory) < 0) {
1015                         log_error("chdir(%s) failed: %m", arg_directory);
1016                         goto child_fail;
1017                 }
1018
1019                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
1020                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
1021                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
1022                         goto child_fail;
1023
1024                 if (mount(arg_directory, "/", "bind", MS_MOVE, NULL) < 0) {
1025                         log_error("mount(MS_BIND) failed: %m");
1026                         goto child_fail;
1027                 }
1028
1029                 if (chroot(".") < 0) {
1030                         log_error("chroot() failed: %m");
1031                         goto child_fail;
1032                 }
1033
1034                 if (chdir("/") < 0) {
1035                         log_error("chdir() failed: %m");
1036                         goto child_fail;
1037                 }
1038
1039                 umask(0022);
1040
1041                 loopback_setup();
1042
1043                 if (drop_capabilities() < 0)
1044                         goto child_fail;
1045
1046                 if (arg_user) {
1047
1048                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
1049                                 log_error("get_user_creds() failed: %m");
1050                                 goto child_fail;
1051                         }
1052
1053                         if (mkdir_parents(home, 0775) < 0) {
1054                                 log_error("mkdir_parents() failed: %m");
1055                                 goto child_fail;
1056                         }
1057
1058                         if (safe_mkdir(home, 0775, uid, gid) < 0) {
1059                                 log_error("safe_mkdir() failed: %m");
1060                                 goto child_fail;
1061                         }
1062
1063                         if (initgroups((const char*)arg_user, gid) < 0) {
1064                                 log_error("initgroups() failed: %m");
1065                                 goto child_fail;
1066                         }
1067
1068                         if (setresgid(gid, gid, gid) < 0) {
1069                                 log_error("setregid() failed: %m");
1070                                 goto child_fail;
1071                         }
1072
1073                         if (setresuid(uid, uid, uid) < 0) {
1074                                 log_error("setreuid() failed: %m");
1075                                 goto child_fail;
1076                         }
1077                 }
1078
1079                 if ((asprintf((char**)(envp + 3), "HOME=%s", home ? home: "/root") < 0) ||
1080                     (asprintf((char**)(envp + 4), "USER=%s", arg_user ? arg_user : "root") < 0) ||
1081                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user ? arg_user : "root") < 0)) {
1082                     log_error("Out of memory");
1083                     goto child_fail;
1084                 }
1085
1086                 if (arg_uuid) {
1087                         if (asprintf((char**)(envp + 6), "container_uuid=%s", arg_uuid) < 0) {
1088                                 log_error("Out of memory");
1089                                 goto child_fail;
1090                         }
1091                 }
1092
1093                 setup_hostname();
1094
1095                 if (arg_boot) {
1096                         char **a;
1097                         size_t l;
1098
1099                         /* Automatically search for the init system */
1100
1101                         l = 1 + argc - optind;
1102                         a = newa(char*, l + 1);
1103                         memcpy(a + 1, argv + optind, l * sizeof(char*));
1104
1105                         a[0] = (char*) "/usr/lib/systemd/systemd";
1106                         execve(a[0], a, (char**) envp);
1107
1108                         a[0] = (char*) "/lib/systemd/systemd";
1109                         execve(a[0], a, (char**) envp);
1110
1111                         a[0] = (char*) "/sbin/init";
1112                         execve(a[0], a, (char**) envp);
1113                 } else if (argc > optind)
1114                         execvpe(argv[optind], argv + optind, (char**) envp);
1115                 else {
1116                         chdir(home ? home : "/root");
1117                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1118                 }
1119
1120                 log_error("execv() failed: %m");
1121
1122         child_fail:
1123                 _exit(EXIT_FAILURE);
1124         }
1125
1126         if (process_pty(master, &mask) < 0)
1127                 goto finish;
1128
1129         if (saved_attr_valid) {
1130                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1131                 saved_attr_valid = false;
1132         }
1133
1134         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1135
1136         if (r < 0)
1137                 r = EXIT_FAILURE;
1138
1139 finish:
1140         if (saved_attr_valid)
1141                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1142
1143         if (master >= 0)
1144                 close_nointr_nofail(master);
1145
1146         close_pipe(kmsg_socket_pair);
1147
1148         if (oldcg)
1149                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1150
1151         if (newcg)
1152                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1153
1154         free(arg_directory);
1155         strv_free(arg_controllers);
1156         free(oldcg);
1157         free(newcg);
1158
1159         return r;
1160 }