chiark / gitweb /
9fc256e51d073f73d34664e4139b5a4cda36f41a
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41
42 #include <systemd/sd-daemon.h>
43
44 #include "log.h"
45 #include "util.h"
46 #include "mkdir.h"
47 #include "audit.h"
48 #include "missing.h"
49 #include "cgroup-util.h"
50 #include "strv.h"
51 #include "loopback-setup.h"
52
/* Command line settings, filled in by parse_argv(). */

/* -D/--directory: root directory of the container. main() turns it into an
 * absolute path, or falls back to the current working directory if unset. */
static char *arg_directory = NULL;
/* -u/--user: user name or uid given on the command line (heap copy). */
static char *arg_user = NULL;
/* -C/--controllers: de-duplicated list of additional cgroup controllers;
 * main() creates and joins a cgroup in each of them. */
static char **arg_controllers = NULL;
/* --private-network: when set, main() adds CLONE_NEWNET to the clone() flags. */
static bool arg_private_network = false;
57
/* Prints the usage summary for the program to stdout. Always returns 0. */
static int help(void) {

        /* Headline carries the name we were invoked as ... */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n", program_invocation_short_name);

        /* ... followed by the fixed description and option table. */
        fputs("Spawn a minimal namespace container for debugging, testing and building.\n\n"
              "  -h --help             Show this help\n"
              "  -D --directory=NAME   Root directory for the container\n"
              "  -u --user=USER        Run the command under specified user or uid\n"
              "  -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
              "     --private-network  Disable network in container\n",
              stdout);

        return 0;
}
71
72 static int parse_argv(int argc, char *argv[]) {
73
74         enum {
75                 ARG_PRIVATE_NETWORK = 0x100
76         };
77
78         static const struct option options[] = {
79                 { "help",            no_argument,       NULL, 'h'                 },
80                 { "directory",       required_argument, NULL, 'D'                 },
81                 { "user",            required_argument, NULL, 'u'                 },
82                 { "controllers",     required_argument, NULL, 'C'                 },
83                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
84                 { NULL,              0,                 NULL, 0                   }
85         };
86
87         int c;
88
89         assert(argc >= 0);
90         assert(argv);
91
92         while ((c = getopt_long(argc, argv, "+hD:u:C:", options, NULL)) >= 0) {
93
94                 switch (c) {
95
96                 case 'h':
97                         help();
98                         return 0;
99
100                 case 'D':
101                         free(arg_directory);
102                         if (!(arg_directory = strdup(optarg))) {
103                                 log_error("Failed to duplicate root directory.");
104                                 return -ENOMEM;
105                         }
106
107                         break;
108
109                 case 'u':
110                         free(arg_user);
111                         if (!(arg_user = strdup(optarg))) {
112                                 log_error("Failed to duplicate user name.");
113                                 return -ENOMEM;
114                         }
115
116                         break;
117
118                 case 'C':
119                         strv_free(arg_controllers);
120                         arg_controllers = strv_split(optarg, ",");
121                         if (!arg_controllers) {
122                                 log_error("Failed to split controllers list.");
123                                 return -ENOMEM;
124                         }
125                         strv_uniq(arg_controllers);
126
127                         break;
128
129                 case ARG_PRIVATE_NETWORK:
130                         arg_private_network = true;
131                         break;
132
133                 case '?':
134                         return -EINVAL;
135
136                 default:
137                         log_error("Unknown option code %c", c);
138                         return -EINVAL;
139                 }
140         }
141
142         return 1;
143 }
144
/* Mounts the API file systems listed in mount_table below underneath 'dest'.
 *
 * Every entry is attempted even if an earlier one failed; only running out
 * of memory stops the loop. Returns 0 if nothing went wrong, otherwise the
 * first negative errno-style error that was recorded. Failures of entries
 * not marked fatal are ignored. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV,      true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND,                           true  },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                           true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,          true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                           true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        unsigned idx;
        int ret = 0;

        for (idx = 0; idx < ELEMENTSOF(mount_table); idx++) {
                const MountPoint *entry = mount_table + idx;
                char *target;
                int probe;

                if (asprintf(&target, "%s/%s", dest, entry->where) < 0) {
                        log_error("Out of memory");

                        if (ret == 0)
                                ret = -ENOMEM;

                        break;
                }

                /* Only the error case of the probe matters here; a
                 * non-negative result does not change what we do next. */
                probe = path_is_mount_point(target, false);
                if (probe < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-probe));
                        free(target);

                        if (ret == 0)
                                ret = probe;

                        continue;
                }

                /* Best effort: make sure the mount point exists */
                mkdir_p(target, 0755);

                if (mount(entry->what, target, entry->type, entry->flags, entry->options) < 0) {

                        /* Optional entries may fail silently */
                        if (entry->fatal) {
                                log_error("mount(%s) failed: %m", target);

                                if (ret == 0)
                                        ret = -errno;
                        }
                }

                free(target);
        }

        return ret;
}
218
/* Bind mounts 'what' onto 'where' and, only if that worked, remounts the
 * bind mount read-only. Failures are deliberately ignored. */
static void timezone_bind_read_only(const char *what, const char *where) {
        if (mount(what, where, "bind", MS_BIND, NULL) < 0)
                return;

        mount(what, where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
}

/* Makes the host's /etc/localtime and /etc/timezone visible (read-only)
 * at the same paths below 'dest'. Mount failures are tolerated; the only
 * error reported is -ENOMEM. Returns 0 otherwise. */
static int setup_timezone(const char *dest) {
        char *target;

        assert(dest);

        /* Fix the timezone, if possible */
        if (asprintf(&target, "%s/etc/localtime", dest) < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        timezone_bind_read_only("/etc/localtime", target);
        free(target);

        if (asprintf(&target, "%s/etc/timezone", dest) < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        timezone_bind_read_only("/etc/timezone", target);
        free(target);

        return 0;
}
247
248 static int copy_devnodes(const char *dest) {
249
250         static const char devnodes[] =
251                 "null\0"
252                 "zero\0"
253                 "full\0"
254                 "random\0"
255                 "urandom\0"
256                 "tty\0"
257                 "ptmx\0"
258                 "rtc0\0";
259
260         const char *d;
261         int r = 0;
262         mode_t u;
263
264         assert(dest);
265
266         u = umask(0000);
267
268         NULSTR_FOREACH(d, devnodes) {
269                 struct stat st;
270                 char *from = NULL, *to = NULL;
271
272                 asprintf(&from, "/dev/%s", d);
273                 asprintf(&to, "%s/dev/%s", dest, d);
274
275                 if (!from || !to) {
276                         log_error("Failed to allocate devnode path");
277
278                         free(from);
279                         free(to);
280
281                         from = to = NULL;
282
283                         if (r == 0)
284                                 r = -ENOMEM;
285
286                         break;
287                 }
288
289                 if (stat(from, &st) < 0) {
290
291                         if (errno != ENOENT) {
292                                 log_error("Failed to stat %s: %m", from);
293                                 if (r == 0)
294                                         r = -errno;
295                         }
296
297                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
298
299                         log_error("%s is not a char or block device, cannot copy.", from);
300                         if (r == 0)
301                                 r = -EIO;
302
303                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
304
305                         log_error("mknod(%s) failed: %m", dest);
306                         if (r == 0)
307                                 r = -errno;
308                 }
309
310                 free(from);
311                 free(to);
312         }
313
314         umask(u);
315
316         return r;
317 }
318
/* Makes the tty named by 'console' appear as <dest>/dev/console.
 *
 * The tty is first forced to mode 0600, owner 0:0; then a placeholder device
 * node is created at the target path and the tty is bind mounted over it.
 * Returns a negative errno-style value on failure; on success the
 * (non-negative) result of chmod_and_chown() is returned. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat st;
        char *target = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&target, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        /* ptys can only live on pts file systems, so the right tty has
         * to be bind mounted to /dev/console. A bind mount needs
         * something to mount onto, hence a device node is created
         * first. Its major/minor is taken from the tty but does not
         * really matter, as the node is mounted over right away. */

        if (mknod(target, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, target, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
        }

finish:
        free(target);
        umask(saved_umask);

        return r;
}
378
/* Creates a FIFO at <dest>/dev/kmsg, bind mounts it over <dest>/proc/kmsg,
 * and passes an open file descriptor of the FIFO through 'kmsg_socket'
 * (as SCM_RIGHTS ancillary data), so that the FIFO stays open for as long
 * as the socket holds the message.
 *
 * Returns a negative errno-style value on failure; on success the
 * (non-negative) result of chmod_and_chown() is returned. */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;
        char *fifo_path = NULL, *proc_path = NULL;
        mode_t saved_umask;
        int r, fifo_fd, sent;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        if (asprintf(&fifo_path, "%s/dev/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        if (asprintf(&proc_path, "%s/proc/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        if (mkfifo(fifo_path, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        r = chmod_and_chown(fifo_path, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto finish;
        }

        if (mount(fifo_path, proc_path, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        fifo_fd = open(fifo_path, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                r = -errno;
                goto finish;
        }

        /* Build a message that carries nothing but the fd as ancillary data */
        zero(control);
        zero(mh);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fifo_fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Park the fd in the socket, so that it remains open for as
         * long as the child is running */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);

        /* Our own reference is no longer needed, whether or not sending worked */
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                r = -errno;
        }

finish:
        free(fifo_path);
        free(proc_path);
        umask(saved_umask);

        return r;
}
464
/* Removes every capability from the bounding set except those listed in
 * 'retain'. Walks all capability numbers from 0 up to and including
 * cap_last_cap(). Returns 0 on success, or -errno as soon as one
 * PR_CAPBSET_DROP call fails. */
static int drop_capabilities(void) {
        static const unsigned long retain[] = {
                CAP_CHOWN,
                CAP_DAC_OVERRIDE,
                CAP_DAC_READ_SEARCH,
                CAP_FOWNER,
                CAP_FSETID,
                CAP_IPC_OWNER,
                CAP_KILL,
                CAP_LEASE,
                CAP_LINUX_IMMUTABLE,
                CAP_NET_BIND_SERVICE,
                CAP_NET_BROADCAST,
                CAP_NET_RAW,
                CAP_SETGID,
                CAP_SETFCAP,
                CAP_SETPCAP,
                CAP_SETUID,
                CAP_SYS_ADMIN,
                CAP_SYS_CHROOT,
                CAP_SYS_NICE,
                CAP_SYS_PTRACE,
                CAP_SYS_TTY_CONFIG
        };

        unsigned long cap;

        for (cap = 0; cap <= cap_last_cap(); cap++) {
                bool keep = false;
                unsigned n;

                /* Is this capability on the keep list? */
                for (n = 0; n < ELEMENTSOF(retain) && !keep; n++)
                        keep = retain[n] == cap;

                if (keep)
                        continue;

                if (prctl(PR_CAPBSET_DROP, cap) < 0) {
                        log_error("PR_CAPBSET_DROP failed: %m");
                        return -errno;
                }
        }

        return 0;
}
510
/* Checks whether 'path' looks like the root of an OS installation.
 *
 * The presence of <path>/bin/sh serves as the flag file. Returns 1 if it is
 * accessible, 0 if not, and -ENOMEM if the path could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
524
/* Forwards data between our own stdin/stdout and the pty 'master' until
 * told to stop.
 *
 * stdin, stdout and the master are switched to non-blocking mode and
 * watched edge-triggered via epoll, together with a signalfd created from
 * 'mask'. Bytes read from stdin are buffered and written to the master;
 * bytes read from the master are buffered and written to stdout. SIGWINCH
 * is handled by copying the window size of stdin to the master; any other
 * signal delivered through the signalfd ends the loop.
 *
 * Returns 0 when terminated by a signal, or a negative errno-style value
 * on failure. */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
                log_error("Failed to create epoll: %m");
                r = -errno;
                goto finish;
        }

        /* The data fds are edge-triggered: we remember readiness in the
         * *_readable/*_writable flags and only clear them again when the
         * kernel tells us the fd would block. */
        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        /* The signalfd is level-triggered, it is read once per wakeup */
        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                log_error("Failed to regiser fds in epoll: %m");
                r = -errno;
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        log_error("epoll_wait(): %m");
                        r = -errno;
                        goto finish;
                }

                assert(nfds >= 1);

                /* First pass: record which fds became ready, and handle signals */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                log_error("Failed to read from signalfd: %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                /* Any other signal in the mask means: stop forwarding */
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* Second pass: move data for as long as any direction can make progress */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdin_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF: nothing more will arrive. Without clearing the
                                         * flag the loop condition stays true with an empty
                                         * buffer and we would spin here forever, never getting
                                         * back to epoll_wait() to notice signals. */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                if ((k = write(master, in_buffer, in_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the remainder at the front */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF on the master: same reasoning as for stdin above */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdout_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the remainder at the front */
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
727
728 int main(int argc, char *argv[]) {
729         pid_t pid = 0;
730         int r = EXIT_FAILURE, k;
731         char *oldcg = NULL, *newcg = NULL;
732         char **controller = NULL;
733         int master = -1;
734         const char *console = NULL;
735         struct termios saved_attr, raw_attr;
736         sigset_t mask;
737         bool saved_attr_valid = false;
738         struct winsize ws;
739         int kmsg_socket_pair[2] = { -1, -1 };
740
741         log_parse_environment();
742         log_open();
743
744         if ((r = parse_argv(argc, argv)) <= 0)
745                 goto finish;
746
747         if (arg_directory) {
748                 char *p;
749
750                 p = path_make_absolute_cwd(arg_directory);
751                 free(arg_directory);
752                 arg_directory = p;
753         } else
754                 arg_directory = get_current_dir_name();
755
756         if (!arg_directory) {
757                 log_error("Failed to determine path");
758                 goto finish;
759         }
760
761         path_kill_slashes(arg_directory);
762
763         if (geteuid() != 0) {
764                 log_error("Need to be root.");
765                 goto finish;
766         }
767
768         if (sd_booted() <= 0) {
769                 log_error("Not running on a systemd system.");
770                 goto finish;
771         }
772
773         if (path_equal(arg_directory, "/")) {
774                 log_error("Spawning container on root directory not supported.");
775                 goto finish;
776         }
777
778         if (is_os_tree(arg_directory) <= 0) {
779                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
780                 goto finish;
781         }
782
783         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
784                 log_error("Failed to determine current cgroup: %s", strerror(-k));
785                 goto finish;
786         }
787
788         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
789                 log_error("Failed to allocate cgroup path.");
790                 goto finish;
791         }
792
793         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
794         if (k < 0)  {
795                 log_error("Failed to create cgroup: %s", strerror(-k));
796                 goto finish;
797         }
798
799         STRV_FOREACH(controller,arg_controllers) {
800                 k = cg_create_and_attach(*controller, newcg, 0);
801                 if (k < 0)
802                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
803         }
804
805         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
806                 log_error("Failed to acquire pseudo tty: %m");
807                 goto finish;
808         }
809
810         if (!(console = ptsname(master))) {
811                 log_error("Failed to determine tty name: %m");
812                 goto finish;
813         }
814
815         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
816
817         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
818                 ioctl(master, TIOCSWINSZ, &ws);
819
820         if (unlockpt(master) < 0) {
821                 log_error("Failed to unlock tty: %m");
822                 goto finish;
823         }
824
825         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
826                 log_error("Failed to get terminal attributes: %m");
827                 goto finish;
828         }
829
830         saved_attr_valid = true;
831
832         raw_attr = saved_attr;
833         cfmakeraw(&raw_attr);
834         raw_attr.c_lflag &= ~ECHO;
835
836         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
837                 log_error("Failed to set terminal attributes: %m");
838                 goto finish;
839         }
840
841         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
842                 log_error("Failed to create kmsg socket pair");
843                 goto finish;
844         }
845
846         assert_se(sigemptyset(&mask) == 0);
847         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
848         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
849
850         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
851         if (pid < 0) {
852                 if (errno == EINVAL)
853                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
854                 else
855                         log_error("clone() failed: %m");
856
857                 goto finish;
858         }
859
860         if (pid == 0) {
861                 /* child */
862
863                 const char *hn;
864                 const char *home = NULL;
865                 uid_t uid = (uid_t) -1;
866                 gid_t gid = (gid_t) -1;
867                 const char *envp[] = {
868                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
869                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
870                         NULL, /* TERM */
871                         NULL, /* HOME */
872                         NULL, /* USER */
873                         NULL, /* LOGNAME */
874                         NULL
875                 };
876
877                 envp[2] = strv_find_prefix(environ, "TERM=");
878
879                 close_nointr_nofail(master);
880
881                 close_nointr(STDIN_FILENO);
882                 close_nointr(STDOUT_FILENO);
883                 close_nointr(STDERR_FILENO);
884
885                 close_all_fds(&kmsg_socket_pair[1], 1);
886
887                 reset_all_signal_handlers();
888
889                 assert_se(sigemptyset(&mask) == 0);
890                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
891
892                 if (setsid() < 0)
893                         goto child_fail;
894
895                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
896                         goto child_fail;
897
898                 /* Mark / as private, in case somebody marked it shared */
899                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
900                         goto child_fail;
901
902                 if (mount_all(arg_directory) < 0)
903                         goto child_fail;
904
905                 if (copy_devnodes(arg_directory) < 0)
906                         goto child_fail;
907
908                 if (setup_dev_console(arg_directory, console) < 0)
909                         goto child_fail;
910
911                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
912                         goto child_fail;
913
914                 close_nointr_nofail(kmsg_socket_pair[1]);
915
916                 if (setup_timezone(arg_directory) < 0)
917                         goto child_fail;
918
919                 if (chdir(arg_directory) < 0) {
920                         log_error("chdir(%s) failed: %m", arg_directory);
921                         goto child_fail;
922                 }
923
924                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
925                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
926                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
927                         goto child_fail;
928
929                 if (mount(arg_directory, "/", "bind", MS_BIND|MS_MOVE, NULL) < 0) {
930                         log_error("mount(MS_MOVE) failed: %m");
931                         goto child_fail;
932                 }
933
934                 if (chroot(".") < 0) {
935                         log_error("chroot() failed: %m");
936                         goto child_fail;
937                 }
938
939                 if (chdir("/") < 0) {
940                         log_error("chdir() failed: %m");
941                         goto child_fail;
942                 }
943
944                 umask(0022);
945
946                 loopback_setup();
947
948                 if (drop_capabilities() < 0)
949                         goto child_fail;
950
951                 if (arg_user) {
952
953                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
954                                 log_error("get_user_creds() failed: %m");
955                                 goto child_fail;
956                         }
957
958                         if (mkdir_parents(home, 0775) < 0) {
959                                 log_error("mkdir_parents() failed: %m");
960                                 goto child_fail;
961                         }
962
963                         if (safe_mkdir(home, 0775, uid, gid) < 0) {
964                                 log_error("safe_mkdir() failed: %m");
965                                 goto child_fail;
966                         }
967
968                         if (initgroups((const char*)arg_user, gid) < 0) {
969                                 log_error("initgroups() failed: %m");
970                                 goto child_fail;
971                         }
972
973                         if (setresgid(gid, gid, gid) < 0) {
974                                 log_error("setregid() failed: %m");
975                                 goto child_fail;
976                         }
977
978                         if (setresuid(uid, uid, uid) < 0) {
979                                 log_error("setreuid() failed: %m");
980                                 goto child_fail;
981                         }
982                 }
983
984                 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
985                     (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
986                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
987                     log_error("Out of memory");
988                     goto child_fail;
989                 }
990
991                 if ((hn = file_name_from_path(arg_directory)))
992                         sethostname(hn, strlen(hn));
993
994                 if (argc > optind)
995                         execvpe(argv[optind], argv + optind, (char**) envp);
996                 else {
997                         chdir(home ? home : "/root");
998                         execle("/bin/bash", "-bash", NULL, (char**) envp);
999                 }
1000
1001                 log_error("execv() failed: %m");
1002
1003         child_fail:
1004                 _exit(EXIT_FAILURE);
1005         }
1006
1007         if (process_pty(master, &mask) < 0)
1008                 goto finish;
1009
1010         if (saved_attr_valid) {
1011                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1012                 saved_attr_valid = false;
1013         }
1014
1015         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1016
1017         if (r < 0)
1018                 r = EXIT_FAILURE;
1019
1020 finish:
1021         if (saved_attr_valid)
1022                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1023
1024         if (master >= 0)
1025                 close_nointr_nofail(master);
1026
1027         close_pipe(kmsg_socket_pair);
1028
1029         if (oldcg)
1030                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1031
1032         if (newcg)
1033                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1034
1035         free(arg_directory);
1036         strv_free(arg_controllers);
1037         free(oldcg);
1038         free(newcg);
1039
1040         return r;
1041 }