chiark / gitweb /
move more main systemd parts to core/
[elogind.git] / src / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41
42 #include <systemd/sd-daemon.h>
43
44 #include "log.h"
45 #include "util.h"
46 #include "mkdir.h"
47 #include "audit.h"
48 #include "missing.h"
49 #include "cgroup-util.h"
50 #include "strv.h"
51 #include "loopback-setup.h"
52
/* Command line settings. All of them have static storage duration and
 * therefore start out as NULL/false until parse_argv() fills them in. */

/* -D/--directory: root directory of the container (heap allocated). */
static char *arg_directory;

/* -u/--user: user name or uid to run the payload as (heap allocated). */
static char *arg_user;

/* -C/--controllers: list of additional cgroup hierarchies, as split from
 * the comma-separated option argument. */
static char **arg_controllers;

/* --private-network: request a separate network namespace for the container. */
static bool arg_private_network;
57
/* Prints the usage summary to standard output. Always returns 0. */
static int help(void) {

        static const char option_summary[] =
                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
                "  -h --help             Show this help\n"
                "  -D --directory=NAME   Root directory for the container\n"
                "  -u --user=USER        Run the command under specified user or uid\n"
                "  -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
                "     --private-network  Disable network in container\n";

        /* Synopsis line first, then the fixed option table. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n", program_invocation_short_name);
        fputs(option_summary, stdout);

        return 0;
}
71
72 static int parse_argv(int argc, char *argv[]) {
73
74         enum {
75                 ARG_PRIVATE_NETWORK = 0x100
76         };
77
78         static const struct option options[] = {
79                 { "help",            no_argument,       NULL, 'h'                 },
80                 { "directory",       required_argument, NULL, 'D'                 },
81                 { "user",            required_argument, NULL, 'u'                 },
82                 { "controllers",     required_argument, NULL, 'C'                 },
83                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
84                 { NULL,              0,                 NULL, 0                   }
85         };
86
87         int c;
88
89         assert(argc >= 0);
90         assert(argv);
91
92         while ((c = getopt_long(argc, argv, "+hD:u:C:", options, NULL)) >= 0) {
93
94                 switch (c) {
95
96                 case 'h':
97                         help();
98                         return 0;
99
100                 case 'D':
101                         free(arg_directory);
102                         if (!(arg_directory = strdup(optarg))) {
103                                 log_error("Failed to duplicate root directory.");
104                                 return -ENOMEM;
105                         }
106
107                         break;
108
109                 case 'u':
110                         free(arg_user);
111                         if (!(arg_user = strdup(optarg))) {
112                                 log_error("Failed to duplicate user name.");
113                                 return -ENOMEM;
114                         }
115
116                         break;
117
118                 case 'C':
119                         strv_free(arg_controllers);
120                         arg_controllers = strv_split(optarg, ",");
121                         if (!arg_controllers) {
122                                 log_error("Failed to split controllers list.");
123                                 return -ENOMEM;
124                         }
125                         strv_uniq(arg_controllers);
126
127                         break;
128
129                 case ARG_PRIVATE_NETWORK:
130                         arg_private_network = true;
131                         break;
132
133                 case '?':
134                         return -EINVAL;
135
136                 default:
137                         log_error("Unknown option code %c", c);
138                         return -EINVAL;
139                 }
140         }
141
142         return 1;
143 }
144
/* Sets up the API file systems of the container below the directory dest.
 *
 * Every entry of mount_table is mounted on "<dest>/<where>". A failing
 * mount() is only reported and recorded if the entry is marked fatal;
 * processing always continues with the next entry. Afterwards the host's
 * time zone files are bind mounted read-only into the tree and
 * <dest>/proc/kmsg is covered with /dev/null, all on a best-effort basis.
 *
 * Returns 0 on success, otherwise the negative errno-style code of the
 * first recorded failure. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        /* Host files that are bind mounted read-only to fix the time zone. */
        static const char * const timezone_files[] = {
                "/etc/localtime",
                "/etc/timezone"
        };

        unsigned k;
        int r = 0;
        char *where;

        for (k = 0; k < ELEMENTSOF(mount_table); k++) {
                int t;

                if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
                        log_error("Out of memory");

                        if (r == 0)
                                r = -ENOMEM;

                        break;
                }

                /* Only the failure case of the check is acted upon here. */
                t = path_is_mount_point(where, false);
                if (t < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
                        free(where);

                        if (r == 0)
                                r = t;

                        continue;
                }

                mkdir_p(where, 0755);

                if (mount(mount_table[k].what,
                          where,
                          mount_table[k].type,
                          mount_table[k].flags,
                          mount_table[k].options) < 0 &&
                    mount_table[k].fatal) {

                        /* Remember errno before logging, since the log
                         * call may overwrite it. */
                        int saved_errno = errno;

                        log_error("mount(%s) failed: %m", where);

                        if (r == 0)
                                r = -saved_errno;
                }

                free(where);
        }

        /* Fix the timezone, if possible: bind mount first, then make it r/o */
        for (k = 0; k < ELEMENTSOF(timezone_files); k++) {

                if (asprintf(&where, "%s%s", dest, timezone_files[k]) < 0)
                        continue;

                if (mount(timezone_files[k], where, "bind", MS_BIND, NULL) >= 0)
                        mount(timezone_files[k], where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(where);
        }

        /* Cover the kernel log of the container with /dev/null. */
        if (asprintf(&where, "%s/proc/kmsg", dest) >= 0) {
                mount("/dev/null", where, "bind", MS_BIND, NULL);
                free(where);
        }

        return r;
}
240
241 static int copy_devnodes(const char *dest, const char *console) {
242
243         static const char devnodes[] =
244                 "null\0"
245                 "zero\0"
246                 "full\0"
247                 "random\0"
248                 "urandom\0"
249                 "tty\0"
250                 "ptmx\0"
251                 "kmsg\0"
252                 "rtc0\0";
253
254         const char *d;
255         int r = 0, k;
256         mode_t u;
257         struct stat st;
258         char *from = NULL, *to = NULL;
259
260         assert(dest);
261         assert(console);
262
263         u = umask(0000);
264
265         NULSTR_FOREACH(d, devnodes) {
266                 from = to = NULL;
267
268                 asprintf(&from, "/dev/%s", d);
269                 asprintf(&to, "%s/dev/%s", dest, d);
270
271                 if (!from || !to) {
272                         log_error("Failed to allocate devnode path");
273
274                         free(from);
275                         free(to);
276
277                         from = to = NULL;
278
279                         if (r == 0)
280                                 r = -ENOMEM;
281
282                         break;
283                 }
284
285                 if (stat(from, &st) < 0) {
286
287                         if (errno != ENOENT) {
288                                 log_error("Failed to stat %s: %m", from);
289                                 if (r == 0)
290                                         r = -errno;
291                         }
292
293                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
294
295                         log_error("%s is not a char or block device, cannot copy.", from);
296                         if (r == 0)
297                                 r = -EIO;
298
299                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
300
301                         log_error("mknod(%s) failed: %m", dest);
302                         if (r == 0)
303                                 r = -errno;
304                 }
305
306                 free(from);
307                 free(to);
308         }
309
310         if (stat(console, &st) < 0) {
311
312                 log_error("Failed to stat %s: %m", console);
313                 if (r == 0)
314                         r = -errno;
315
316                 goto finish;
317
318         } else if (!S_ISCHR(st.st_mode)) {
319
320                 log_error("/dev/console is not a char device.");
321                 if (r == 0)
322                         r = -EIO;
323
324                 goto finish;
325         }
326
327         if (asprintf(&to, "%s/dev/console", dest) < 0) {
328
329                 log_error("Out of memory");
330                 if (r == 0)
331                         r = -ENOMEM;
332
333                  goto finish;
334         }
335
336         /* We need to bind mount the right tty to /dev/console since
337          * ptys can only exist on pts file systems. To have something
338          * to bind mount things on we create a device node first, that
339          * has the right major/minor (note that the major minor
340          * doesn't actually matter here, since we mount it over
341          * anyway). */
342
343         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
344                 log_error("mknod for /dev/console failed: %m");
345
346         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
347                 log_error("bind mount for /dev/console failed: %m");
348
349                 if (r == 0)
350                         r = -errno;
351         }
352
353         free(to);
354
355         if ((k = chmod_and_chown(console, 0600, 0, 0)) < 0) {
356                 log_error("Failed to correct access mode for TTY: %s", strerror(-k));
357
358                 if (r == 0)
359                         r = k;
360         }
361
362 finish:
363         umask(u);
364
365         return r;
366 }
367
368 static int drop_capabilities(void) {
369         static const unsigned long retain[] = {
370                 CAP_CHOWN,
371                 CAP_DAC_OVERRIDE,
372                 CAP_DAC_READ_SEARCH,
373                 CAP_FOWNER,
374                 CAP_FSETID,
375                 CAP_IPC_OWNER,
376                 CAP_KILL,
377                 CAP_LEASE,
378                 CAP_LINUX_IMMUTABLE,
379                 CAP_NET_BIND_SERVICE,
380                 CAP_NET_BROADCAST,
381                 CAP_NET_RAW,
382                 CAP_SETGID,
383                 CAP_SETFCAP,
384                 CAP_SETPCAP,
385                 CAP_SETUID,
386                 CAP_SYS_ADMIN,
387                 CAP_SYS_CHROOT,
388                 CAP_SYS_NICE,
389                 CAP_SYS_PTRACE,
390                 CAP_SYS_TTY_CONFIG
391         };
392
393         unsigned long l;
394
395         for (l = 0; l <= cap_last_cap(); l++) {
396                 unsigned i;
397
398                 for (i = 0; i < ELEMENTSOF(retain); i++)
399                         if (retain[i] == l)
400                                 break;
401
402                 if (i < ELEMENTSOF(retain))
403                         continue;
404
405                 if (prctl(PR_CAPBSET_DROP, l) < 0) {
406                         log_error("PR_CAPBSET_DROP failed: %m");
407                         return -errno;
408                 }
409         }
410
411         return 0;
412 }
413
/* Checks whether the directory path looks like the root of an OS tree.
 * The existence of <path>/bin/sh is used as the flag for that.
 *
 * Returns 1 if the file exists, 0 if it cannot be accessed, and -ENOMEM
 * if the path to test could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found ? 1 : 0;
}
427
/* Tells whether a failed read()/write() merely means that this direction
 * of the file descriptor cannot be used for now (as opposed to a fatal
 * error). process_pty() then clears the corresponding readiness flag. */
static bool pty_io_blocked(int error) {
        return error == EAGAIN ||
               error == EPIPE ||
               error == ECONNRESET ||
               error == EIO;
}

/* Shovels data between stdin/stdout of this process and the pty master
 * until a signal from mask other than SIGWINCH is received.
 *
 * stdin is copied to master through in_buffer, master is copied to stdout
 * through out_buffer. The three file descriptors are switched to
 * non-blocking mode and watched edge-triggered; the signals in mask are
 * received through a signalfd. SIGWINCH is handled by copying the window
 * size of stdin to master.
 *
 * Returns 0 when terminated by a signal, a negative errno-style value on
 * failure. */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        /* In all error paths below errno is captured before logging,
         * since the log call may overwrite it. */

        signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
        if (signal_fd < 0) {
                r = -errno;
                log_error("signalfd(): %m");
                goto finish;
        }

        ep = epoll_create1(EPOLL_CLOEXEC);
        if (ep < 0) {
                r = -errno;
                log_error("Failed to create epoll: %m");
                goto finish;
        }

        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                r = -errno;
                log_error("Failed to regiser fds in epoll: %m");
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
                if (nfds < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        r = -errno;
                        log_error("epoll_wait(): %m");
                        goto finish;
                }

                assert(nfds >= 1);

                /* First, record what became ready and handle signals. */
                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                n = read(signal_fd, &sfsi, sizeof(sfsi));
                                if (n != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                r = -errno;
                                                log_error("Failed to read from signalfd: %m");
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                /* Any other signal of the mask ends the loop. */
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* Then, move data for as long as at least one of the
                 * four transfers can make progress. */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
                                if (k < 0) {

                                        if (pty_io_blocked(errno))
                                                stdin_readable = false;
                                        else {
                                                r = -errno;
                                                log_error("read(): %m");
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* End of file: nothing more to read
                                         * for now. Without this the loop
                                         * condition stays true and we would
                                         * spin without ever getting back to
                                         * epoll_wait(). */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                k = write(master, in_buffer, in_buffer_full);
                                if (k < 0) {

                                        if (pty_io_blocked(errno))
                                                master_writable = false;
                                        else {
                                                r = -errno;
                                                log_error("write(): %m");
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the rest. */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
                                if (k < 0) {

                                        if (pty_io_blocked(errno))
                                                master_readable = false;
                                        else {
                                                r = -errno;
                                                log_error("read(): %m");
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* See the stdin case above. */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
                                if (k < 0) {

                                        if (pty_io_blocked(errno))
                                                stdout_writable = false;
                                        else {
                                                r = -errno;
                                                log_error("write(): %m");
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the rest. */
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
630
631 int main(int argc, char *argv[]) {
632         pid_t pid = 0;
633         int r = EXIT_FAILURE, k;
634         char *oldcg = NULL, *newcg = NULL;
635         char **controller = NULL;
636         int master = -1;
637         const char *console = NULL;
638         struct termios saved_attr, raw_attr;
639         sigset_t mask;
640         bool saved_attr_valid = false;
641         struct winsize ws;
642
643         log_parse_environment();
644         log_open();
645
646         if ((r = parse_argv(argc, argv)) <= 0)
647                 goto finish;
648
649         if (arg_directory) {
650                 char *p;
651
652                 p = path_make_absolute_cwd(arg_directory);
653                 free(arg_directory);
654                 arg_directory = p;
655         } else
656                 arg_directory = get_current_dir_name();
657
658         if (!arg_directory) {
659                 log_error("Failed to determine path");
660                 goto finish;
661         }
662
663         path_kill_slashes(arg_directory);
664
665         if (geteuid() != 0) {
666                 log_error("Need to be root.");
667                 goto finish;
668         }
669
670         if (sd_booted() <= 0) {
671                 log_error("Not running on a systemd system.");
672                 goto finish;
673         }
674
675         if (path_equal(arg_directory, "/")) {
676                 log_error("Spawning container on root directory not supported.");
677                 goto finish;
678         }
679
680         if (is_os_tree(arg_directory) <= 0) {
681                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
682                 goto finish;
683         }
684
685         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
686                 log_error("Failed to determine current cgroup: %s", strerror(-k));
687                 goto finish;
688         }
689
690         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
691                 log_error("Failed to allocate cgroup path.");
692                 goto finish;
693         }
694
695         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
696         if (k < 0)  {
697                 log_error("Failed to create cgroup: %s", strerror(-k));
698                 goto finish;
699         }
700
701         STRV_FOREACH(controller,arg_controllers) {
702                 k = cg_create_and_attach(*controller, newcg, 0);
703                 if (k < 0)
704                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
705         }
706
707         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
708                 log_error("Failed to acquire pseudo tty: %m");
709                 goto finish;
710         }
711
712         if (!(console = ptsname(master))) {
713                 log_error("Failed to determine tty name: %m");
714                 goto finish;
715         }
716
717         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
718
719         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
720                 ioctl(master, TIOCSWINSZ, &ws);
721
722         if (unlockpt(master) < 0) {
723                 log_error("Failed to unlock tty: %m");
724                 goto finish;
725         }
726
727         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
728                 log_error("Failed to get terminal attributes: %m");
729                 goto finish;
730         }
731
732         saved_attr_valid = true;
733
734         raw_attr = saved_attr;
735         cfmakeraw(&raw_attr);
736         raw_attr.c_lflag &= ~ECHO;
737
738         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
739                 log_error("Failed to set terminal attributes: %m");
740                 goto finish;
741         }
742
743         assert_se(sigemptyset(&mask) == 0);
744         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
745         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
746
747         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
748         if (pid < 0) {
749                 if (errno == EINVAL)
750                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
751                 else
752                         log_error("clone() failed: %m");
753
754                 goto finish;
755         }
756
757         if (pid == 0) {
758                 /* child */
759
760                 const char *hn;
761                 const char *home = NULL;
762                 uid_t uid = (uid_t) -1;
763                 gid_t gid = (gid_t) -1;
764                 const char *envp[] = {
765                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
766                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
767                         NULL, /* TERM */
768                         NULL, /* HOME */
769                         NULL, /* USER */
770                         NULL, /* LOGNAME */
771                         NULL
772                 };
773
774                 envp[2] = strv_find_prefix(environ, "TERM=");
775
776                 close_nointr_nofail(master);
777
778                 close_nointr(STDIN_FILENO);
779                 close_nointr(STDOUT_FILENO);
780                 close_nointr(STDERR_FILENO);
781
782                 close_all_fds(NULL, 0);
783
784                 reset_all_signal_handlers();
785
786                 assert_se(sigemptyset(&mask) == 0);
787                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
788
789                 if (setsid() < 0)
790                         goto child_fail;
791
792                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
793                         goto child_fail;
794
795                 /* Mark / as private, in case somebody marked it shared */
796                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
797                         goto child_fail;
798
799                 if (mount_all(arg_directory) < 0)
800                         goto child_fail;
801
802                 if (copy_devnodes(arg_directory, console) < 0)
803                         goto child_fail;
804
805                 if (chdir(arg_directory) < 0) {
806                         log_error("chdir(%s) failed: %m", arg_directory);
807                         goto child_fail;
808                 }
809
810                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
811                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
812                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
813                         goto child_fail;
814
815                 if (mount(arg_directory, "/", "bind", MS_BIND|MS_MOVE, NULL) < 0) {
816                         log_error("mount(MS_MOVE) failed: %m");
817                         goto child_fail;
818                 }
819
820                 if (chroot(".") < 0) {
821                         log_error("chroot() failed: %m");
822                         goto child_fail;
823                 }
824
825                 if (chdir("/") < 0) {
826                         log_error("chdir() failed: %m");
827                         goto child_fail;
828                 }
829
830                 umask(0022);
831
832                 loopback_setup();
833
834                 if (drop_capabilities() < 0)
835                         goto child_fail;
836
837                 if (arg_user) {
838
839                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
840                                 log_error("get_user_creds() failed: %m");
841                                 goto child_fail;
842                         }
843
844                         if (mkdir_parents(home, 0775) < 0) {
845                                 log_error("mkdir_parents() failed: %m");
846                                 goto child_fail;
847                         }
848
849                         if (safe_mkdir(home, 0775, uid, gid) < 0) {
850                                 log_error("safe_mkdir() failed: %m");
851                                 goto child_fail;
852                         }
853
854                         if (initgroups((const char*)arg_user, gid) < 0) {
855                                 log_error("initgroups() failed: %m");
856                                 goto child_fail;
857                         }
858
859                         if (setresgid(gid, gid, gid) < 0) {
860                                 log_error("setregid() failed: %m");
861                                 goto child_fail;
862                         }
863
864                         if (setresuid(uid, uid, uid) < 0) {
865                                 log_error("setreuid() failed: %m");
866                                 goto child_fail;
867                         }
868                 }
869
870                 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
871                     (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
872                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
873                     log_error("Out of memory");
874                     goto child_fail;
875                 }
876
877                 if ((hn = file_name_from_path(arg_directory)))
878                         sethostname(hn, strlen(hn));
879
880                 if (argc > optind)
881                         execvpe(argv[optind], argv + optind, (char**) envp);
882                 else {
883                         chdir(home ? home : "/root");
884                         execle("/bin/bash", "-bash", NULL, (char**) envp);
885                 }
886
887                 log_error("execv() failed: %m");
888
889         child_fail:
890                 _exit(EXIT_FAILURE);
891         }
892
893         if (process_pty(master, &mask) < 0)
894                 goto finish;
895
896         if (saved_attr_valid) {
897                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
898                 saved_attr_valid = false;
899         }
900
901         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
902
903         if (r < 0)
904                 r = EXIT_FAILURE;
905
906 finish:
907         if (saved_attr_valid)
908                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
909
910         if (master >= 0)
911                 close_nointr_nofail(master);
912
913         if (oldcg)
914                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
915
916         if (newcg)
917                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
918
919         free(arg_directory);
920         strv_free(arg_controllers);
921         free(oldcg);
922         free(newcg);
923
924         return r;
925 }