chiark / gitweb /
90c8b94248b406c3baefca74fd272e2a6c609462
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "loopback-setup.h"
54
/* Command line settings, filled in by parse_argv(). Objects with static
 * storage duration start out zeroed, i.e. NULL/false, which means
 * "option not given". */
static char *arg_directory;        /* -D/--directory= argument (heap copy) */
static char *arg_user;             /* -u/--user= argument (heap copy) */
static char **arg_controllers;     /* -C/--controllers= list, split at commas */
static bool arg_private_network;   /* set by --private-network */
59
/* Writes the usage summary to stdout, with the program's short
 * invocation name substituted into the synopsis line. Always returns 0. */
static int help(void) {

        /* Only the synopsis line needs formatting ... */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n",
               program_invocation_short_name);

        /* ... the remainder is fixed text. */
        fputs("Spawn a minimal namespace container for debugging, testing and building.\n\n"
              "  -h --help             Show this help\n"
              "  -D --directory=NAME   Root directory for the container\n"
              "  -u --user=USER        Run the command under specified user or uid\n"
              "  -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
              "     --private-network  Disable network in container\n",
              stdout);

        return 0;
}
73
74 static int parse_argv(int argc, char *argv[]) {
75
76         enum {
77                 ARG_PRIVATE_NETWORK = 0x100
78         };
79
80         static const struct option options[] = {
81                 { "help",            no_argument,       NULL, 'h'                 },
82                 { "directory",       required_argument, NULL, 'D'                 },
83                 { "user",            required_argument, NULL, 'u'                 },
84                 { "controllers",     required_argument, NULL, 'C'                 },
85                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
86                 { NULL,              0,                 NULL, 0                   }
87         };
88
89         int c;
90
91         assert(argc >= 0);
92         assert(argv);
93
94         while ((c = getopt_long(argc, argv, "+hD:u:C:", options, NULL)) >= 0) {
95
96                 switch (c) {
97
98                 case 'h':
99                         help();
100                         return 0;
101
102                 case 'D':
103                         free(arg_directory);
104                         if (!(arg_directory = strdup(optarg))) {
105                                 log_error("Failed to duplicate root directory.");
106                                 return -ENOMEM;
107                         }
108
109                         break;
110
111                 case 'u':
112                         free(arg_user);
113                         if (!(arg_user = strdup(optarg))) {
114                                 log_error("Failed to duplicate user name.");
115                                 return -ENOMEM;
116                         }
117
118                         break;
119
120                 case 'C':
121                         strv_free(arg_controllers);
122                         arg_controllers = strv_split(optarg, ",");
123                         if (!arg_controllers) {
124                                 log_error("Failed to split controllers list.");
125                                 return -ENOMEM;
126                         }
127                         strv_uniq(arg_controllers);
128
129                         break;
130
131                 case ARG_PRIVATE_NETWORK:
132                         arg_private_network = true;
133                         break;
134
135                 case '?':
136                         return -EINVAL;
137
138                 default:
139                         log_error("Unknown option code %c", c);
140                         return -EINVAL;
141                 }
142         }
143
144         return 1;
145 }
146
/* Mounts every entry of the built-in mount table beneath the container
 * root 'dest'.
 *
 * Each target directory is created first (best effort). Entries that
 * shall become read-only appear twice in the table: once as plain bind
 * mount, once as read-only remount.
 *
 * A failing mount() is only reported and recorded if the entry is
 * marked fatal. Processing continues after an error, except on memory
 * exhaustion. Returns 0 on success, otherwise the first error recorded
 * as negative errno-style value. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV,      true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND,                           true  },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                           true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,          true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                           true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        int first_error = 0;
        unsigned i;

        for (i = 0; i < ELEMENTSOF(mount_table); i++) {
                const MountPoint *m = &mount_table[i];
                char *target;
                int t;

                /* Build the path of the mount point inside the container */
                if (asprintf(&target, "%s/%s", dest, m->where) < 0) {
                        log_error("Out of memory");

                        if (first_error == 0)
                                first_error = -ENOMEM;

                        break;
                }

                t = path_is_mount_point(target, false);
                if (t < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-t));
                        free(target);

                        if (first_error == 0)
                                first_error = t;

                        continue;
                }

                /* Best effort, a missing directory surfaces in mount() below */
                mkdir_p(target, 0755);

                if (mount(m->what, target, m->type, m->flags, m->options) < 0 &&
                    m->fatal) {

                        log_error("mount(%s) failed: %m", target);

                        if (first_error == 0)
                                first_error = -errno;
                }

                free(target);
        }

        return first_error;
}
220
/* Bind mounts the host file 'path' onto the same path below 'dest' and,
 * if that worked, remounts it read-only. Mount failures are ignored on
 * purpose. Returns 0, or -ENOMEM if the target path cannot be allocated. */
static int bind_mount_readonly(const char *dest, const char *path) {
        char *where;

        if (asprintf(&where, "%s%s", dest, path) < 0) {
                log_error("Out of memory");
                return -ENOMEM;
        }

        if (mount(path, where, "bind", MS_BIND, NULL) >= 0)
                mount(path, where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

        free(where);

        return 0;
}

/* Makes the host's time zone configuration visible below the container
 * root 'dest', if possible: /etc/localtime and /etc/timezone are bind
 * mounted read-only. Returns 0, or -ENOMEM on memory exhaustion. */
static int setup_timezone(const char *dest) {
        int r;

        assert(dest);

        /* Fix the timezone, if possible */
        r = bind_mount_readonly(dest, "/etc/localtime");
        if (r < 0)
                return r;

        r = bind_mount_readonly(dest, "/etc/timezone");
        if (r < 0)
                return r;

        return 0;
}
249
250 static int copy_devnodes(const char *dest) {
251
252         static const char devnodes[] =
253                 "null\0"
254                 "zero\0"
255                 "full\0"
256                 "random\0"
257                 "urandom\0"
258                 "tty\0"
259                 "ptmx\0"
260                 "rtc0\0";
261
262         const char *d;
263         int r = 0;
264         mode_t u;
265
266         assert(dest);
267
268         u = umask(0000);
269
270         NULSTR_FOREACH(d, devnodes) {
271                 struct stat st;
272                 char *from = NULL, *to = NULL;
273
274                 asprintf(&from, "/dev/%s", d);
275                 asprintf(&to, "%s/dev/%s", dest, d);
276
277                 if (!from || !to) {
278                         log_error("Failed to allocate devnode path");
279
280                         free(from);
281                         free(to);
282
283                         from = to = NULL;
284
285                         if (r == 0)
286                                 r = -ENOMEM;
287
288                         break;
289                 }
290
291                 if (stat(from, &st) < 0) {
292
293                         if (errno != ENOENT) {
294                                 log_error("Failed to stat %s: %m", from);
295                                 if (r == 0)
296                                         r = -errno;
297                         }
298
299                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
300
301                         log_error("%s is not a char or block device, cannot copy.", from);
302                         if (r == 0)
303                                 r = -EIO;
304
305                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
306
307                         log_error("mknod(%s) failed: %m", dest);
308                         if (r == 0)
309                                 r = -errno;
310                 }
311
312                 free(from);
313                 free(to);
314         }
315
316         umask(u);
317
318         return r;
319 }
320
/* Makes the tty 'console' available as '<dest>/dev/console'.
 *
 * The tty's access mode and ownership are set to 0600 root:root, a
 * character device node is created at the target path and the tty is
 * then bind mounted over it.
 *
 * Returns a negative errno-style value on failure. On success the
 * (non-negative) result of chmod_and_chown() is returned. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat st;
        char *node = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        /* Create the node with exactly the mode we ask for */
        saved_umask = umask(0000);

        if (stat(console, &st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        /* We need to bind mount the right tty to /dev/console since
         * ptys can only exist on pts file systems. To have something
         * to bind mount things on we create a device node first, that
         * has the right major/minor (note that the major minor
         * doesn't actually matter here, since we mount it over
         * anyway). */

        if (mknod(node, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
380
/* Replaces the container's /proc/kmsg by a FIFO.
 *
 * A FIFO is created at '<dest>/dev/kmsg' and bind mounted over
 * '<dest>/proc/kmsg'. It is then opened and the file descriptor is
 * queued as SCM_RIGHTS message on 'kmsg_socket', so that a reference
 * stays around after our own descriptor is closed.
 *
 * Returns a negative errno-style value on failure. On success the
 * (non-negative) result of chmod_and_chown() is returned. */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo = NULL, *proc_kmsg = NULL;
        mode_t saved_umask;
        int r, fd, sent;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        if (asprintf(&fifo, "%s/dev/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        if (asprintf(&proc_kmsg, "%s/proc/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        if (mkfifo(fifo, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        r = chmod_and_chown(fifo, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto finish;
        }

        if (mount(fifo, proc_kmsg, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        fd = open(fifo, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fd < 0) {
                log_error("Failed to open fifo: %m");
                r = -errno;
                goto finish;
        }

        /* Build a message without payload that carries only the fd as
         * ancillary data. msg_controllen has to cover the buffer
         * before CMSG_FIRSTHDR() may be used. */
        zero(mh);
        zero(control);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

        /* Only the part actually filled in is passed on */
        mh.msg_controllen = cmsg->cmsg_len;

        /* Store away the fd in the socket, so that it stays open as
         * long as we run the child */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                r = -errno;
        }

finish:
        free(fifo);
        free(proc_kmsg);
        umask(saved_umask);

        return r;
}
466
467 static int drop_capabilities(void) {
468         static const unsigned long retain[] = {
469                 CAP_CHOWN,
470                 CAP_DAC_OVERRIDE,
471                 CAP_DAC_READ_SEARCH,
472                 CAP_FOWNER,
473                 CAP_FSETID,
474                 CAP_IPC_OWNER,
475                 CAP_KILL,
476                 CAP_LEASE,
477                 CAP_LINUX_IMMUTABLE,
478                 CAP_NET_BIND_SERVICE,
479                 CAP_NET_BROADCAST,
480                 CAP_NET_RAW,
481                 CAP_SETGID,
482                 CAP_SETFCAP,
483                 CAP_SETPCAP,
484                 CAP_SETUID,
485                 CAP_SYS_ADMIN,
486                 CAP_SYS_CHROOT,
487                 CAP_SYS_NICE,
488                 CAP_SYS_PTRACE,
489                 CAP_SYS_TTY_CONFIG
490         };
491
492         unsigned long l;
493
494         for (l = 0; l <= cap_last_cap(); l++) {
495                 unsigned i;
496
497                 for (i = 0; i < ELEMENTSOF(retain); i++)
498                         if (retain[i] == l)
499                                 break;
500
501                 if (i < ELEMENTSOF(retain))
502                         continue;
503
504                 if (prctl(PR_CAPBSET_DROP, l) < 0) {
505                         log_error("PR_CAPBSET_DROP failed: %m");
506                         return -errno;
507                 }
508         }
509
510         return 0;
511 }
512
/* Checks whether 'path' looks like the root of an OS installation. The
 * existence of <path>/bin/sh serves as flag file for that.
 *
 * Returns 1 if the file exists, 0 if it is not accessible, and -ENOMEM
 * if the path to probe cannot be allocated. */
static int is_os_tree(const char *path) {
        char *shell;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
526
/* Forwards data between our own stdin/stdout and the pty 'master'.
 *
 * Bytes read from stdin are written to the master, bytes read from the
 * master are written to stdout, each direction going through a buffer
 * of LINE_MAX bytes. The signals in 'mask' are received through a
 * signalfd: SIGWINCH causes the window size of stdin to be copied to
 * the master, any other signal ends the loop.
 *
 * stdin, stdout and the master are switched to non-blocking mode and
 * watched edge-triggered, hence the readable/writable state of each is
 * remembered in flags until an operation reports that it is exhausted.
 *
 * Returns 0 when terminated by a signal, a negative errno-style value
 * on failure. */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
                log_error("Failed to create epoll: %m");
                r = -errno;
                goto finish;
        }

        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        /* The signalfd is the only fd watched level-triggered */
        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                log_error("Failed to register fds in epoll: %m");
                r = -errno;
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        log_error("epoll_wait(): %m");
                        r = -errno;
                        goto finish;
                }

                assert(nfds >= 1);

                /* First, record what the events tell us ... */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                log_error("Failed to read from signalfd: %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                /* Any other signal from the mask ends the forwarding */
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* ... then shovel data for as long as progress is
                 * possible in at least one direction. */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdin_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* End of file. If the flag stayed set the
                                         * condition of this loop would remain true
                                         * forever and we'd spin without ever getting
                                         * back to epoll_wait(). */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                if ((k = write(master, in_buffer, in_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Move what was not written yet to the front */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* End of file, same reasoning as for stdin */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdout_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Move what was not written yet to the front */
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
729
730 int main(int argc, char *argv[]) {
731         pid_t pid = 0;
732         int r = EXIT_FAILURE, k;
733         char *oldcg = NULL, *newcg = NULL;
734         char **controller = NULL;
735         int master = -1;
736         const char *console = NULL;
737         struct termios saved_attr, raw_attr;
738         sigset_t mask;
739         bool saved_attr_valid = false;
740         struct winsize ws;
741         int kmsg_socket_pair[2] = { -1, -1 };
742
743         log_parse_environment();
744         log_open();
745
746         if ((r = parse_argv(argc, argv)) <= 0)
747                 goto finish;
748
749         if (arg_directory) {
750                 char *p;
751
752                 p = path_make_absolute_cwd(arg_directory);
753                 free(arg_directory);
754                 arg_directory = p;
755         } else
756                 arg_directory = get_current_dir_name();
757
758         if (!arg_directory) {
759                 log_error("Failed to determine path");
760                 goto finish;
761         }
762
763         path_kill_slashes(arg_directory);
764
765         if (geteuid() != 0) {
766                 log_error("Need to be root.");
767                 goto finish;
768         }
769
770         if (sd_booted() <= 0) {
771                 log_error("Not running on a systemd system.");
772                 goto finish;
773         }
774
775         if (path_equal(arg_directory, "/")) {
776                 log_error("Spawning container on root directory not supported.");
777                 goto finish;
778         }
779
780         if (is_os_tree(arg_directory) <= 0) {
781                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
782                 goto finish;
783         }
784
785         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
786                 log_error("Failed to determine current cgroup: %s", strerror(-k));
787                 goto finish;
788         }
789
790         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
791                 log_error("Failed to allocate cgroup path.");
792                 goto finish;
793         }
794
795         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
796         if (k < 0)  {
797                 log_error("Failed to create cgroup: %s", strerror(-k));
798                 goto finish;
799         }
800
801         STRV_FOREACH(controller,arg_controllers) {
802                 k = cg_create_and_attach(*controller, newcg, 0);
803                 if (k < 0)
804                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
805         }
806
807         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
808                 log_error("Failed to acquire pseudo tty: %m");
809                 goto finish;
810         }
811
812         if (!(console = ptsname(master))) {
813                 log_error("Failed to determine tty name: %m");
814                 goto finish;
815         }
816
817         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
818
819         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
820                 ioctl(master, TIOCSWINSZ, &ws);
821
822         if (unlockpt(master) < 0) {
823                 log_error("Failed to unlock tty: %m");
824                 goto finish;
825         }
826
827         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
828                 log_error("Failed to get terminal attributes: %m");
829                 goto finish;
830         }
831
832         saved_attr_valid = true;
833
834         raw_attr = saved_attr;
835         cfmakeraw(&raw_attr);
836         raw_attr.c_lflag &= ~ECHO;
837
838         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
839                 log_error("Failed to set terminal attributes: %m");
840                 goto finish;
841         }
842
843         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
844                 log_error("Failed to create kmsg socket pair");
845                 goto finish;
846         }
847
848         assert_se(sigemptyset(&mask) == 0);
849         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
850         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
851
852         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
853         if (pid < 0) {
854                 if (errno == EINVAL)
855                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
856                 else
857                         log_error("clone() failed: %m");
858
859                 goto finish;
860         }
861
862         if (pid == 0) {
863                 /* child */
864
865                 const char *hn;
866                 const char *home = NULL;
867                 uid_t uid = (uid_t) -1;
868                 gid_t gid = (gid_t) -1;
869                 const char *envp[] = {
870                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
871                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
872                         NULL, /* TERM */
873                         NULL, /* HOME */
874                         NULL, /* USER */
875                         NULL, /* LOGNAME */
876                         NULL
877                 };
878
879                 envp[2] = strv_find_prefix(environ, "TERM=");
880
881                 close_nointr_nofail(master);
882
883                 close_nointr(STDIN_FILENO);
884                 close_nointr(STDOUT_FILENO);
885                 close_nointr(STDERR_FILENO);
886
887                 close_all_fds(&kmsg_socket_pair[1], 1);
888
889                 reset_all_signal_handlers();
890
891                 assert_se(sigemptyset(&mask) == 0);
892                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
893
894                 if (setsid() < 0)
895                         goto child_fail;
896
897                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
898                         goto child_fail;
899
900                 /* Mark / as private, in case somebody marked it shared */
901                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
902                         goto child_fail;
903
904                 if (mount_all(arg_directory) < 0)
905                         goto child_fail;
906
907                 if (copy_devnodes(arg_directory) < 0)
908                         goto child_fail;
909
910                 if (setup_dev_console(arg_directory, console) < 0)
911                         goto child_fail;
912
913                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
914                         goto child_fail;
915
916                 close_nointr_nofail(kmsg_socket_pair[1]);
917
918                 if (setup_timezone(arg_directory) < 0)
919                         goto child_fail;
920
921                 if (chdir(arg_directory) < 0) {
922                         log_error("chdir(%s) failed: %m", arg_directory);
923                         goto child_fail;
924                 }
925
926                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
927                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
928                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
929                         goto child_fail;
930
931                 if (mount(arg_directory, "/", "bind", MS_BIND, NULL) < 0) {
932                         log_error("mount(MS_MOVE) failed: %m");
933                         goto child_fail;
934                 }
935
936                 if (chroot(".") < 0) {
937                         log_error("chroot() failed: %m");
938                         goto child_fail;
939                 }
940
941                 if (chdir("/") < 0) {
942                         log_error("chdir() failed: %m");
943                         goto child_fail;
944                 }
945
946                 umask(0022);
947
948                 loopback_setup();
949
950                 if (drop_capabilities() < 0)
951                         goto child_fail;
952
953                 if (arg_user) {
954
955                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
956                                 log_error("get_user_creds() failed: %m");
957                                 goto child_fail;
958                         }
959
960                         if (mkdir_parents(home, 0775) < 0) {
961                                 log_error("mkdir_parents() failed: %m");
962                                 goto child_fail;
963                         }
964
965                         if (safe_mkdir(home, 0775, uid, gid) < 0) {
966                                 log_error("safe_mkdir() failed: %m");
967                                 goto child_fail;
968                         }
969
970                         if (initgroups((const char*)arg_user, gid) < 0) {
971                                 log_error("initgroups() failed: %m");
972                                 goto child_fail;
973                         }
974
975                         if (setresgid(gid, gid, gid) < 0) {
976                                 log_error("setregid() failed: %m");
977                                 goto child_fail;
978                         }
979
980                         if (setresuid(uid, uid, uid) < 0) {
981                                 log_error("setreuid() failed: %m");
982                                 goto child_fail;
983                         }
984                 }
985
986                 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
987                     (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
988                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
989                     log_error("Out of memory");
990                     goto child_fail;
991                 }
992
993                 if ((hn = file_name_from_path(arg_directory)))
994                         sethostname(hn, strlen(hn));
995
996                 if (argc > optind)
997                         execvpe(argv[optind], argv + optind, (char**) envp);
998                 else {
999                         chdir(home ? home : "/root");
1000                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1001                 }
1002
1003                 log_error("execv() failed: %m");
1004
1005         child_fail:
1006                 _exit(EXIT_FAILURE);
1007         }
1008
1009         if (process_pty(master, &mask) < 0)
1010                 goto finish;
1011
1012         if (saved_attr_valid) {
1013                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1014                 saved_attr_valid = false;
1015         }
1016
1017         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1018
1019         if (r < 0)
1020                 r = EXIT_FAILURE;
1021
1022 finish:
1023         if (saved_attr_valid)
1024                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1025
1026         if (master >= 0)
1027                 close_nointr_nofail(master);
1028
1029         close_pipe(kmsg_socket_pair);
1030
1031         if (oldcg)
1032                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1033
1034         if (newcg)
1035                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1036
1037         free(arg_directory);
1038         strv_free(arg_controllers);
1039         free(oldcg);
1040         free(newcg);
1041
1042         return r;
1043 }