chiark / gitweb /
71cdd3f39f27e23ab248dc84058ccf437f2f7399
[elogind.git] / src / nspawn / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41 #include <sys/un.h>
42 #include <sys/socket.h>
43
44 #include <systemd/sd-daemon.h>
45
46 #include "log.h"
47 #include "util.h"
48 #include "mkdir.h"
49 #include "audit.h"
50 #include "missing.h"
51 #include "cgroup-util.h"
52 #include "strv.h"
53 #include "loopback-setup.h"
54
/* Command-line settings; parse_argv() stores the option values here. */
static char *arg_directory = NULL;          /* -D / --directory */
static char *arg_user = NULL;               /* -u / --user */
static char **arg_controllers = NULL;       /* -C / --controllers, split on "," */
static bool arg_private_network = false;    /* --private-network */
59
/* Print the usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Only the first line needs formatting; the rest is static text. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n",
               program_invocation_short_name);

        fputs("Spawn a minimal namespace container for debugging, testing and building.\n\n"
              "  -h --help             Show this help\n"
              "  -D --directory=NAME   Root directory for the container\n"
              "  -u --user=USER        Run the command under specified user or uid\n"
              "  -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
              "     --private-network  Disable network in container\n",
              stdout);

        return 0;
}
73
74 static int parse_argv(int argc, char *argv[]) {
75
76         enum {
77                 ARG_PRIVATE_NETWORK = 0x100
78         };
79
80         static const struct option options[] = {
81                 { "help",            no_argument,       NULL, 'h'                 },
82                 { "directory",       required_argument, NULL, 'D'                 },
83                 { "user",            required_argument, NULL, 'u'                 },
84                 { "controllers",     required_argument, NULL, 'C'                 },
85                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
86                 { NULL,              0,                 NULL, 0                   }
87         };
88
89         int c;
90
91         assert(argc >= 0);
92         assert(argv);
93
94         while ((c = getopt_long(argc, argv, "+hD:u:C:", options, NULL)) >= 0) {
95
96                 switch (c) {
97
98                 case 'h':
99                         help();
100                         return 0;
101
102                 case 'D':
103                         free(arg_directory);
104                         if (!(arg_directory = strdup(optarg))) {
105                                 log_error("Failed to duplicate root directory.");
106                                 return -ENOMEM;
107                         }
108
109                         break;
110
111                 case 'u':
112                         free(arg_user);
113                         if (!(arg_user = strdup(optarg))) {
114                                 log_error("Failed to duplicate user name.");
115                                 return -ENOMEM;
116                         }
117
118                         break;
119
120                 case 'C':
121                         strv_free(arg_controllers);
122                         arg_controllers = strv_split(optarg, ",");
123                         if (!arg_controllers) {
124                                 log_error("Failed to split controllers list.");
125                                 return -ENOMEM;
126                         }
127                         strv_uniq(arg_controllers);
128
129                         break;
130
131                 case ARG_PRIVATE_NETWORK:
132                         arg_private_network = true;
133                         break;
134
135                 case '?':
136                         return -EINVAL;
137
138                 default:
139                         log_error("Unknown option code %c", c);
140                         return -EINVAL;
141                 }
142         }
143
144         return 1;
145 }
146
/* Mount every entry of the built-in mount table below dest.
 *
 * All entries are attempted even if an earlier one fails (only running out
 * of memory stops the loop). The return value is 0 on success, or the first
 * error that was recorded, as a negative errno-style value. A failing
 * mount() is only recorded for entries marked fatal. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV,      true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND,                           true  },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                           true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,          true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                           true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        int first_error = 0;
        size_t i;

        for (i = 0; i < ELEMENTSOF(mount_table); i++) {
                const MountPoint *m = mount_table + i;
                char *target;
                int q;

                if (asprintf(&target, "%s/%s", dest, m->where) < 0) {
                        log_error("Out of memory");

                        if (first_error == 0)
                                first_error = -ENOMEM;

                        break;
                }

                q = path_is_mount_point(target, false);
                if (q < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-q));

                        if (first_error == 0)
                                first_error = q;
                } else {
                        /* Result deliberately ignored, as before: mount() below reports the problem */
                        mkdir_p(target, 0755);

                        if (mount(m->what, target, m->type, m->flags, m->options) < 0 &&
                            m->fatal) {

                                log_error("mount(%s) failed: %m", target);

                                if (first_error == 0)
                                        first_error = -errno;
                        }
                }

                free(target);
        }

        return first_error;
}
220
/* Bind mount source onto target and, if that worked, remount it read-only.
 * Both steps are best effort; failures are ignored. */
static void timezone_bind_ro(const char *source, const char *target) {
        if (mount(source, target, "bind", MS_BIND, NULL) < 0)
                return;

        mount(source, target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
}

/* Fix the timezone of the tree at dest, if possible: /etc/localtime and
 * /etc/timezone are bind mounted read-only over the files of the same name
 * below dest. Returns 0, or -ENOMEM if a path could not be allocated. */
static int setup_timezone(const char *dest) {
        char *target;

        assert(dest);

        if (asprintf(&target, "%s/etc/localtime", dest) < 0)
                goto oom;

        timezone_bind_ro("/etc/localtime", target);
        free(target);

        if (asprintf(&target, "%s/etc/timezone", dest) < 0)
                goto oom;

        timezone_bind_ro("/etc/timezone", target);
        free(target);

        return 0;

oom:
        log_error("Out of memory");
        return -ENOMEM;
}
249
250 static int copy_devnodes(const char *dest) {
251
252         static const char devnodes[] =
253                 "null\0"
254                 "zero\0"
255                 "full\0"
256                 "random\0"
257                 "urandom\0"
258                 "tty\0"
259                 "ptmx\0"
260                 "rtc0\0";
261
262         const char *d;
263         int r = 0;
264         mode_t u;
265
266         assert(dest);
267
268         u = umask(0000);
269
270         NULSTR_FOREACH(d, devnodes) {
271                 struct stat st;
272                 char *from = NULL, *to = NULL;
273
274                 asprintf(&from, "/dev/%s", d);
275                 asprintf(&to, "%s/dev/%s", dest, d);
276
277                 if (!from || !to) {
278                         log_error("Failed to allocate devnode path");
279
280                         free(from);
281                         free(to);
282
283                         from = to = NULL;
284
285                         if (r == 0)
286                                 r = -ENOMEM;
287
288                         break;
289                 }
290
291                 if (stat(from, &st) < 0) {
292
293                         if (errno != ENOENT) {
294                                 log_error("Failed to stat %s: %m", from);
295                                 if (r == 0)
296                                         r = -errno;
297                         }
298
299                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
300
301                         log_error("%s is not a char or block device, cannot copy.", from);
302                         if (r == 0)
303                                 r = -EIO;
304
305                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
306
307                         log_error("mknod(%s) failed: %m", dest);
308                         if (r == 0)
309                                 r = -errno;
310                 }
311
312                 free(from);
313                 free(to);
314         }
315
316         umask(u);
317
318         return r;
319 }
320
/* Make the tty named by console available as <dest>/dev/console.
 *
 * The tty gets mode 0600 and owner 0:0, a character device node is created
 * at <dest>/dev/console, and the tty is bind mounted on top of that node.
 *
 * Returns a negative errno-style value on failure; on success the result of
 * chmod_and_chown() is passed through. */
static int setup_dev_console(const char *dest, const char *console) {
        struct stat console_st;
        char *node = NULL;
        mode_t saved_umask;
        int r;

        assert(dest);
        assert(console);

        saved_umask = umask(0000);

        if (stat(console, &console_st) < 0) {
                log_error("Failed to stat %s: %m", console);
                r = -errno;
                goto finish;
        }

        if (!S_ISCHR(console_st.st_mode)) {
                log_error("/dev/console is not a char device.");
                r = -EIO;
                goto finish;
        }

        r = chmod_and_chown(console, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for TTY: %s", strerror(-r));
                goto finish;
        }

        if (asprintf(&node, "%s/dev/console", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        /* ptys can only exist on pts file systems, so the right tty
         * has to be bind mounted to /dev/console. A device node is
         * created first so that there is something to mount on. Its
         * major/minor does not actually matter, since it is mounted
         * over anyway. */
        if (mknod(node, (console_st.st_mode & ~07777) | 0600, console_st.st_rdev) < 0) {
                log_error("mknod() for /dev/console failed: %m");
                r = -errno;
                goto finish;
        }

        if (mount(console, node, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /dev/console failed: %m");
                r = -errno;
        }

finish:
        free(node);
        umask(saved_umask);

        return r;
}
380
/* Provide a stand-in for /proc/kmsg inside the tree at dest.
 *
 * A FIFO is created as <dest>/dev/kmsg, bind mounted onto <dest>/proc/kmsg,
 * opened, and the open fd is sent over kmsg_socket as SCM_RIGHTS data.
 * Finally the FIFO is unlinked from <dest>/dev again.
 *
 * Returns a negative errno-style value on failure; on success the result of
 * chmod_and_chown() is passed through. */
static int setup_kmsg(const char *dest, int kmsg_socket) {
        char *fifo_path = NULL, *proc_path = NULL;
        int r, fifo_fd, sent;
        mode_t saved_umask;
        union {
                struct cmsghdr cmsghdr;
                uint8_t buf[CMSG_SPACE(sizeof(int))];
        } control;
        struct msghdr mh;
        struct cmsghdr *cmsg;

        assert(dest);
        assert(kmsg_socket >= 0);

        saved_umask = umask(0000);

        /* The kmsg FIFO is created as /dev/kmsg, but deleted again
         * right after it has been bind mounted to /proc/kmsg. On
         * the reading side FIFOs behave very similar to /proc/kmsg,
         * but their writing side differs from /dev/kmsg: writing
         * blocks when nothing is reading. To keep containers from
         * deadlocking on that, /dev/kmsg is simply made unavailable
         * to the container. */
        if (asprintf(&fifo_path, "%s/dev/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        if (asprintf(&proc_path, "%s/proc/kmsg", dest) < 0) {
                log_error("Out of memory");
                r = -ENOMEM;
                goto finish;
        }

        if (mkfifo(fifo_path, 0600) < 0) {
                log_error("mkfifo() for /dev/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        r = chmod_and_chown(fifo_path, 0600, 0, 0);
        if (r < 0) {
                log_error("Failed to correct access mode for /dev/kmsg: %s", strerror(-r));
                goto finish;
        }

        if (mount(fifo_path, proc_path, "bind", MS_BIND, NULL) < 0) {
                log_error("Bind mount for /proc/kmsg failed: %m");
                r = -errno;
                goto finish;
        }

        fifo_fd = open(fifo_path, O_RDWR|O_NDELAY|O_CLOEXEC);
        if (fifo_fd < 0) {
                log_error("Failed to open fifo: %m");
                r = -errno;
                goto finish;
        }

        /* Build a message that carries nothing but the fd as ancillary data */
        zero(mh);
        zero(control);

        mh.msg_control = &control;
        mh.msg_controllen = sizeof(control);

        cmsg = CMSG_FIRSTHDR(&mh);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(sizeof(int));
        memcpy(CMSG_DATA(cmsg), &fifo_fd, sizeof(int));

        mh.msg_controllen = cmsg->cmsg_len;

        /* Park the fd in the socket, so that it stays open for as
         * long as the child runs */
        sent = sendmsg(kmsg_socket, &mh, MSG_DONTWAIT|MSG_NOSIGNAL);
        close_nointr_nofail(fifo_fd);

        if (sent < 0) {
                log_error("Failed to send FIFO fd: %m");
                r = -errno;
                goto finish;
        }

        /* And now make the FIFO unavailable as /dev/kmsg... */
        unlink(fifo_path);

finish:
        free(fifo_path);
        free(proc_path);
        umask(saved_umask);

        return r;
}
476
/* Tell whether cap is on the list of capabilities that are kept in the
 * bounding set. */
static bool capability_is_retained(unsigned long cap) {
        static const unsigned long retain[] = {
                CAP_CHOWN,
                CAP_DAC_OVERRIDE,
                CAP_DAC_READ_SEARCH,
                CAP_FOWNER,
                CAP_FSETID,
                CAP_IPC_OWNER,
                CAP_KILL,
                CAP_LEASE,
                CAP_LINUX_IMMUTABLE,
                CAP_NET_BIND_SERVICE,
                CAP_NET_BROADCAST,
                CAP_NET_RAW,
                CAP_SETGID,
                CAP_SETFCAP,
                CAP_SETPCAP,
                CAP_SETUID,
                CAP_SYS_ADMIN,
                CAP_SYS_CHROOT,
                CAP_SYS_NICE,
                CAP_SYS_PTRACE,
                CAP_SYS_TTY_CONFIG
        };

        unsigned i;

        for (i = 0; i < ELEMENTSOF(retain); i++)
                if (retain[i] == cap)
                        return true;

        return false;
}

/* Drop every capability from 0 up to cap_last_cap() from the bounding set
 * via PR_CAPBSET_DROP, except those on the retain list.
 *
 * Returns 0 on success, or a negative errno value as soon as one prctl()
 * call fails. */
static int drop_capabilities(void) {
        unsigned long cap;

        for (cap = 0; cap <= cap_last_cap(); cap++) {

                if (capability_is_retained(cap))
                        continue;

                if (prctl(PR_CAPBSET_DROP, cap) < 0) {
                        log_error("PR_CAPBSET_DROP failed: %m");
                        return -errno;
                }
        }

        return 0;
}
522
/* Check whether path looks like an OS tree; the presence of <path>/bin/sh
 * is used as the flag file.
 *
 * Returns 1 if it exists, 0 if access() fails for any reason, and -ENOMEM
 * if the path could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found ? 1 : 0;
}
536
/* Shovel data between our own stdin/stdout and the pty master.
 *
 * stdin, stdout and master are switched to non-blocking mode and watched
 * edge-triggered through epoll, together with a signalfd created from mask.
 * Data read from stdin is written to master, data read from master is
 * written to stdout; each direction is buffered in a LINE_MAX sized buffer.
 *
 * SIGWINCH read from the signalfd copies the window size of stdin to
 * master. Any other signal read from it ends the loop.
 *
 * Returns 0 when ended by such a signal, or a negative errno-style value on
 * failure. */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
        if (signal_fd < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        ep = epoll_create1(EPOLL_CLOEXEC);
        if (ep < 0) {
                log_error("Failed to create epoll: %m");
                r = -errno;
                goto finish;
        }

        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        /* The signalfd is the only level-triggered fd */
        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                log_error("Failed to register fds in epoll: %m");
                r = -errno;
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
                if (nfds < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        log_error("epoll_wait(): %m");
                        r = -errno;
                        goto finish;
                }

                assert(nfds >= 1);

                /* First pass: note which fds became ready, and handle signals */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                n = read(signal_fd, &sfsi, sizeof(sfsi));
                                if (n != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                log_error("Failed to read from signalfd: %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                /* Any other signal from the mask ends the loop */
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* Second pass: move data for as long as any direction can
                 * make progress. The fds are edge-triggered, so each one
                 * is used until it reports EAGAIN (or EOF). */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdin_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF. Without this the flag would stay
                                         * set with an empty buffer and this loop
                                         * would spin forever, never getting back
                                         * to epoll_wait() and the signalfd. */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                k = write(master, in_buffer, in_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the rest at the front */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                master_readable = false;
                                        else {
                                                log_error("read(): %m");
                                                r = -errno;
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* EOF, same reasoning as for stdin above */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
                                if (k < 0) {

                                        if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
                                                stdout_writable = false;
                                        else {
                                                log_error("write(): %m");
                                                r = -errno;
                                                goto finish;
                                        }

                                } else {
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
739
740 int main(int argc, char *argv[]) {
741         pid_t pid = 0;
742         int r = EXIT_FAILURE, k;
743         char *oldcg = NULL, *newcg = NULL;
744         char **controller = NULL;
745         int master = -1;
746         const char *console = NULL;
747         struct termios saved_attr, raw_attr;
748         sigset_t mask;
749         bool saved_attr_valid = false;
750         struct winsize ws;
751         int kmsg_socket_pair[2] = { -1, -1 };
752
753         log_parse_environment();
754         log_open();
755
756         if ((r = parse_argv(argc, argv)) <= 0)
757                 goto finish;
758
759         if (arg_directory) {
760                 char *p;
761
762                 p = path_make_absolute_cwd(arg_directory);
763                 free(arg_directory);
764                 arg_directory = p;
765         } else
766                 arg_directory = get_current_dir_name();
767
768         if (!arg_directory) {
769                 log_error("Failed to determine path");
770                 goto finish;
771         }
772
773         path_kill_slashes(arg_directory);
774
775         if (geteuid() != 0) {
776                 log_error("Need to be root.");
777                 goto finish;
778         }
779
780         if (sd_booted() <= 0) {
781                 log_error("Not running on a systemd system.");
782                 goto finish;
783         }
784
785         if (path_equal(arg_directory, "/")) {
786                 log_error("Spawning container on root directory not supported.");
787                 goto finish;
788         }
789
790         if (is_os_tree(arg_directory) <= 0) {
791                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
792                 goto finish;
793         }
794
795         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
796                 log_error("Failed to determine current cgroup: %s", strerror(-k));
797                 goto finish;
798         }
799
800         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
801                 log_error("Failed to allocate cgroup path.");
802                 goto finish;
803         }
804
805         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
806         if (k < 0)  {
807                 log_error("Failed to create cgroup: %s", strerror(-k));
808                 goto finish;
809         }
810
811         STRV_FOREACH(controller,arg_controllers) {
812                 k = cg_create_and_attach(*controller, newcg, 0);
813                 if (k < 0)
814                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
815         }
816
817         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
818                 log_error("Failed to acquire pseudo tty: %m");
819                 goto finish;
820         }
821
822         if (!(console = ptsname(master))) {
823                 log_error("Failed to determine tty name: %m");
824                 goto finish;
825         }
826
827         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
828
829         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
830                 ioctl(master, TIOCSWINSZ, &ws);
831
832         if (unlockpt(master) < 0) {
833                 log_error("Failed to unlock tty: %m");
834                 goto finish;
835         }
836
837         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
838                 log_error("Failed to get terminal attributes: %m");
839                 goto finish;
840         }
841
842         saved_attr_valid = true;
843
844         raw_attr = saved_attr;
845         cfmakeraw(&raw_attr);
846         raw_attr.c_lflag &= ~ECHO;
847
848         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
849                 log_error("Failed to set terminal attributes: %m");
850                 goto finish;
851         }
852
853         if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_NONBLOCK|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0) {
854                 log_error("Failed to create kmsg socket pair");
855                 goto finish;
856         }
857
858         assert_se(sigemptyset(&mask) == 0);
859         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
860         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
861
862         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
863         if (pid < 0) {
864                 if (errno == EINVAL)
865                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
866                 else
867                         log_error("clone() failed: %m");
868
869                 goto finish;
870         }
871
872         if (pid == 0) {
873                 /* child */
874
875                 const char *hn;
876                 const char *home = NULL;
877                 uid_t uid = (uid_t) -1;
878                 gid_t gid = (gid_t) -1;
879                 const char *envp[] = {
880                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
881                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
882                         NULL, /* TERM */
883                         NULL, /* HOME */
884                         NULL, /* USER */
885                         NULL, /* LOGNAME */
886                         NULL
887                 };
888
889                 envp[2] = strv_find_prefix(environ, "TERM=");
890
891                 close_nointr_nofail(master);
892
893                 close_nointr(STDIN_FILENO);
894                 close_nointr(STDOUT_FILENO);
895                 close_nointr(STDERR_FILENO);
896
897                 close_all_fds(&kmsg_socket_pair[1], 1);
898
899                 reset_all_signal_handlers();
900
901                 assert_se(sigemptyset(&mask) == 0);
902                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
903
904                 if (setsid() < 0)
905                         goto child_fail;
906
907                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
908                         goto child_fail;
909
910                 /* Mark / as private, in case somebody marked it shared */
911                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
912                         goto child_fail;
913
914                 if (mount_all(arg_directory) < 0)
915                         goto child_fail;
916
917                 if (copy_devnodes(arg_directory) < 0)
918                         goto child_fail;
919
920                 if (setup_dev_console(arg_directory, console) < 0)
921                         goto child_fail;
922
923                 if (setup_kmsg(arg_directory, kmsg_socket_pair[1]) < 0)
924                         goto child_fail;
925
926                 close_nointr_nofail(kmsg_socket_pair[1]);
927
928                 if (setup_timezone(arg_directory) < 0)
929                         goto child_fail;
930
931                 if (chdir(arg_directory) < 0) {
932                         log_error("chdir(%s) failed: %m", arg_directory);
933                         goto child_fail;
934                 }
935
936                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
937                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
938                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
939                         goto child_fail;
940
941                 if (mount(arg_directory, "/", "bind", MS_BIND, NULL) < 0) {
942                         log_error("mount(MS_MOVE) failed: %m");
943                         goto child_fail;
944                 }
945
946                 if (chroot(".") < 0) {
947                         log_error("chroot() failed: %m");
948                         goto child_fail;
949                 }
950
951                 if (chdir("/") < 0) {
952                         log_error("chdir() failed: %m");
953                         goto child_fail;
954                 }
955
956                 umask(0022);
957
958                 loopback_setup();
959
960                 if (drop_capabilities() < 0)
961                         goto child_fail;
962
963                 if (arg_user) {
964
965                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
966                                 log_error("get_user_creds() failed: %m");
967                                 goto child_fail;
968                         }
969
970                         if (mkdir_parents(home, 0775) < 0) {
971                                 log_error("mkdir_parents() failed: %m");
972                                 goto child_fail;
973                         }
974
975                         if (safe_mkdir(home, 0775, uid, gid) < 0) {
976                                 log_error("safe_mkdir() failed: %m");
977                                 goto child_fail;
978                         }
979
980                         if (initgroups((const char*)arg_user, gid) < 0) {
981                                 log_error("initgroups() failed: %m");
982                                 goto child_fail;
983                         }
984
985                         if (setresgid(gid, gid, gid) < 0) {
986                                 log_error("setregid() failed: %m");
987                                 goto child_fail;
988                         }
989
990                         if (setresuid(uid, uid, uid) < 0) {
991                                 log_error("setreuid() failed: %m");
992                                 goto child_fail;
993                         }
994                 }
995
996                 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
997                     (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
998                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
999                     log_error("Out of memory");
1000                     goto child_fail;
1001                 }
1002
1003                 if ((hn = file_name_from_path(arg_directory)))
1004                         sethostname(hn, strlen(hn));
1005
1006                 if (argc > optind)
1007                         execvpe(argv[optind], argv + optind, (char**) envp);
1008                 else {
1009                         chdir(home ? home : "/root");
1010                         execle("/bin/bash", "-bash", NULL, (char**) envp);
1011                 }
1012
1013                 log_error("execv() failed: %m");
1014
1015         child_fail:
1016                 _exit(EXIT_FAILURE);
1017         }
1018
1019         if (process_pty(master, &mask) < 0)
1020                 goto finish;
1021
1022         if (saved_attr_valid) {
1023                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1024                 saved_attr_valid = false;
1025         }
1026
1027         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
1028
1029         if (r < 0)
1030                 r = EXIT_FAILURE;
1031
1032 finish:
1033         if (saved_attr_valid)
1034                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
1035
1036         if (master >= 0)
1037                 close_nointr_nofail(master);
1038
1039         close_pipe(kmsg_socket_pair);
1040
1041         if (oldcg)
1042                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
1043
1044         if (newcg)
1045                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
1046
1047         free(arg_directory);
1048         strv_free(arg_controllers);
1049         free(oldcg);
1050         free(newcg);
1051
1052         return r;
1053 }