chiark / gitweb /
units: start vconsole-setup only if there's actually a virtual console device
[elogind.git] / src / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41
42 #include <systemd/sd-daemon.h>
43
44 #include "log.h"
45 #include "util.h"
46 #include "mkdir.h"
47 #include "audit.h"
48 #include "missing.h"
49 #include "cgroup-util.h"
50 #include "strv.h"
51 #include "loopback-setup.h"
52
/* Command line settings; written by parse_argv() and read by main(). */
static char *arg_directory = NULL;         /* -D/--directory: root directory of the container (heap copy of the argument) */
static char *arg_user = NULL;              /* -u/--user: user name or uid to run the command as (heap copy of the argument) */
static char **arg_controllers = NULL;      /* -C/--controllers: cgroup hierarchies split from the comma-separated list */
static bool arg_private_network = false;   /* --private-network: add CLONE_NEWNET when cloning the container process */
57
/* Prints the usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Everything after the synopsis line is constant text */
        static const char option_summary[] =
                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
                "  -h --help             Show this help\n"
                "  -D --directory=NAME   Root directory for the container\n"
                "  -u --user=USER        Run the command under specified user or uid\n"
                "  -C --controllers=LIST Put the container in specified comma-separated cgroup hierarchies\n"
                "     --private-network  Disable network in container\n";

        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n", program_invocation_short_name);
        fputs(option_summary, stdout);

        return 0;
}
71
72 static int parse_argv(int argc, char *argv[]) {
73
74         enum {
75                 ARG_PRIVATE_NETWORK = 0x100
76         };
77
78         static const struct option options[] = {
79                 { "help",            no_argument,       NULL, 'h'                 },
80                 { "directory",       required_argument, NULL, 'D'                 },
81                 { "user",            required_argument, NULL, 'u'                 },
82                 { "controllers",     required_argument, NULL, 'C'                 },
83                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
84                 { NULL,              0,                 NULL, 0                   }
85         };
86
87         int c;
88
89         assert(argc >= 0);
90         assert(argv);
91
92         while ((c = getopt_long(argc, argv, "+hD:u:C:", options, NULL)) >= 0) {
93
94                 switch (c) {
95
96                 case 'h':
97                         help();
98                         return 0;
99
100                 case 'D':
101                         free(arg_directory);
102                         if (!(arg_directory = strdup(optarg))) {
103                                 log_error("Failed to duplicate root directory.");
104                                 return -ENOMEM;
105                         }
106
107                         break;
108
109                 case 'u':
110                         free(arg_user);
111                         if (!(arg_user = strdup(optarg))) {
112                                 log_error("Failed to duplicate user name.");
113                                 return -ENOMEM;
114                         }
115
116                         break;
117
118                 case 'C':
119                         strv_free(arg_controllers);
120                         arg_controllers = strv_split(optarg, ",");
121                         if (!arg_controllers) {
122                                 log_error("Failed to split controllers list.");
123                                 return -ENOMEM;
124                         }
125                         strv_uniq(arg_controllers);
126
127                         break;
128
129                 case ARG_PRIVATE_NETWORK:
130                         arg_private_network = true;
131                         break;
132
133                 case '?':
134                         return -EINVAL;
135
136                 default:
137                         log_error("Unknown option code %c", c);
138                         return -EINVAL;
139                 }
140         }
141
142         return 1;
143 }
144
/* Mounts the API file systems listed in mount_table below the directory
 * "dest", then tries to bind mount the host's /etc/localtime and
 * /etc/timezone read-only into it.
 *
 * Every table entry is attempted even if an earlier one failed (only an
 * allocation failure stops the loop). Returns 0 on success, otherwise the
 * first error encountered as a negative errno-style value. Failures of
 * entries not marked fatal, and of the timezone mounts, are ignored. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND, true                       },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,     true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        int result = 0;
        char *target;
        unsigned idx;

        for (idx = 0; idx < ELEMENTSOF(mount_table); idx++) {
                const MountPoint *entry = mount_table + idx;
                int probe;

                if (asprintf(&target, "%s/%s", dest, entry->where) < 0) {
                        log_error("Out of memory");

                        if (result == 0)
                                result = -ENOMEM;

                        break;
                }

                /* Only the error case of the probe is acted upon */
                probe = path_is_mount_point(target, false);
                if (probe < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-probe));
                        free(target);

                        if (result == 0)
                                result = probe;

                        continue;
                }

                mkdir_p(target, 0755);

                /* The mount is always attempted; its failure only counts for fatal entries */
                if (mount(entry->what, target, entry->type, entry->flags, entry->options) < 0 &&
                    entry->fatal) {

                        log_error("mount(%s) failed: %m", target);

                        if (result == 0)
                                result = -errno;
                }

                free(target);
        }

        /* Fix the timezone, if possible: bind mount first, then make it r/o */
        if (asprintf(&target, "%s/etc/localtime", dest) >= 0) {

                if (!(mount("/etc/localtime", target, "bind", MS_BIND, NULL) < 0))
                        mount("/etc/localtime", target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        if (asprintf(&target, "%s/etc/timezone", dest) >= 0) {

                if (!(mount("/etc/timezone", target, "bind", MS_BIND, NULL) < 0))
                        mount("/etc/timezone", target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return result;
}
234
235 static int copy_devnodes(const char *dest, const char *console) {
236
237         static const char devnodes[] =
238                 "null\0"
239                 "zero\0"
240                 "full\0"
241                 "random\0"
242                 "urandom\0"
243                 "tty\0"
244                 "ptmx\0"
245                 "kmsg\0"
246                 "rtc0\0";
247
248         const char *d;
249         int r = 0, k;
250         mode_t u;
251         struct stat st;
252         char *from = NULL, *to = NULL;
253
254         assert(dest);
255         assert(console);
256
257         u = umask(0000);
258
259         NULSTR_FOREACH(d, devnodes) {
260                 from = to = NULL;
261
262                 asprintf(&from, "/dev/%s", d);
263                 asprintf(&to, "%s/dev/%s", dest, d);
264
265                 if (!from || !to) {
266                         log_error("Failed to allocate devnode path");
267
268                         free(from);
269                         free(to);
270
271                         from = to = NULL;
272
273                         if (r == 0)
274                                 r = -ENOMEM;
275
276                         break;
277                 }
278
279                 if (stat(from, &st) < 0) {
280
281                         if (errno != ENOENT) {
282                                 log_error("Failed to stat %s: %m", from);
283                                 if (r == 0)
284                                         r = -errno;
285                         }
286
287                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
288
289                         log_error("%s is not a char or block device, cannot copy.", from);
290                         if (r == 0)
291                                 r = -EIO;
292
293                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
294
295                         log_error("mknod(%s) failed: %m", dest);
296                         if (r == 0)
297                                 r = -errno;
298                 }
299
300                 free(from);
301                 free(to);
302         }
303
304         if (stat(console, &st) < 0) {
305
306                 log_error("Failed to stat %s: %m", console);
307                 if (r == 0)
308                         r = -errno;
309
310                 goto finish;
311
312         } else if (!S_ISCHR(st.st_mode)) {
313
314                 log_error("/dev/console is not a char device.");
315                 if (r == 0)
316                         r = -EIO;
317
318                 goto finish;
319         }
320
321         if (asprintf(&to, "%s/dev/console", dest) < 0) {
322
323                 log_error("Out of memory");
324                 if (r == 0)
325                         r = -ENOMEM;
326
327                  goto finish;
328         }
329
330         /* We need to bind mount the right tty to /dev/console since
331          * ptys can only exist on pts file systems. To have something
332          * to bind mount things on we create a device node first, that
333          * has the right major/minor (note that the major minor
334          * doesn't actually matter here, since we mount it over
335          * anyway). */
336
337         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
338                 log_error("mknod for /dev/console failed: %m");
339
340         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
341                 log_error("bind mount for /dev/console failed: %m");
342
343                 if (r == 0)
344                         r = -errno;
345         }
346
347         free(to);
348
349         if ((k = chmod_and_chown(console, 0600, 0, 0)) < 0) {
350                 log_error("Failed to correct access mode for TTY: %s", strerror(-k));
351
352                 if (r == 0)
353                         r = k;
354         }
355
356 finish:
357         umask(u);
358
359         return r;
360 }
361
/* Removes every capability that is not listed in retain[] from the
 * capability bounding set of the calling process.
 *
 * Returns 0 on success, or a negative errno value as soon as one
 * PR_CAPBSET_DROP call fails. */
static int drop_capabilities(void) {
        static const unsigned long retain[] = {
                CAP_CHOWN,
                CAP_DAC_OVERRIDE,
                CAP_DAC_READ_SEARCH,
                CAP_FOWNER,
                CAP_FSETID,
                CAP_IPC_OWNER,
                CAP_KILL,
                CAP_LEASE,
                CAP_LINUX_IMMUTABLE,
                CAP_NET_BIND_SERVICE,
                CAP_NET_BROADCAST,
                CAP_NET_RAW,
                CAP_SETGID,
                CAP_SETFCAP,
                CAP_SETPCAP,
                CAP_SETUID,
                CAP_SYS_ADMIN,
                CAP_SYS_CHROOT,
                CAP_SYS_NICE,
                CAP_SYS_PTRACE,
                CAP_SYS_TTY_CONFIG
        };

        unsigned long cap;

        for (cap = 0; cap <= cap_last_cap(); cap++) {
                bool keep = false;
                unsigned n;

                /* Is this capability on the list of those to retain? */
                for (n = 0; n < ELEMENTSOF(retain) && !keep; n++)
                        keep = retain[n] == cap;

                if (keep)
                        continue;

                if (prctl(PR_CAPBSET_DROP, cap) < 0) {
                        log_error("PR_CAPBSET_DROP failed: %m");
                        return -errno;
                }
        }

        return 0;
}
407
/* Checks whether "path" looks like the root of an OS tree.
 *
 * Returns 1 if <path>/bin/sh is accessible, 0 if it is not, and -ENOMEM
 * if the path to probe could not be allocated. */
static int is_os_tree(const char *path) {
        char *shell_path = NULL;
        int found;

        /* The presence of /bin/sh is what flags a directory as an OS */
        if (asprintf(&shell_path, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell_path, F_OK) >= 0;
        free(shell_path);

        return found;
}
421
422 static int process_pty(int master, sigset_t *mask) {
423
424         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
425         size_t in_buffer_full = 0, out_buffer_full = 0;
426         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
427         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
428         int ep = -1, signal_fd = -1, r;
429
430         fd_nonblock(STDIN_FILENO, 1);
431         fd_nonblock(STDOUT_FILENO, 1);
432         fd_nonblock(master, 1);
433
434         if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
435                 log_error("signalfd(): %m");
436                 r = -errno;
437                 goto finish;
438         }
439
440         if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
441                 log_error("Failed to create epoll: %m");
442                 r = -errno;
443                 goto finish;
444         }
445
446         zero(stdin_ev);
447         stdin_ev.events = EPOLLIN|EPOLLET;
448         stdin_ev.data.fd = STDIN_FILENO;
449
450         zero(stdout_ev);
451         stdout_ev.events = EPOLLOUT|EPOLLET;
452         stdout_ev.data.fd = STDOUT_FILENO;
453
454         zero(master_ev);
455         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
456         master_ev.data.fd = master;
457
458         zero(signal_ev);
459         signal_ev.events = EPOLLIN;
460         signal_ev.data.fd = signal_fd;
461
462         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
463             epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
464             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
465             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
466                 log_error("Failed to regiser fds in epoll: %m");
467                 r = -errno;
468                 goto finish;
469         }
470
471         for (;;) {
472                 struct epoll_event ev[16];
473                 ssize_t k;
474                 int i, nfds;
475
476                 if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {
477
478                         if (errno == EINTR || errno == EAGAIN)
479                                 continue;
480
481                         log_error("epoll_wait(): %m");
482                         r = -errno;
483                         goto finish;
484                 }
485
486                 assert(nfds >= 1);
487
488                 for (i = 0; i < nfds; i++) {
489                         if (ev[i].data.fd == STDIN_FILENO) {
490
491                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
492                                         stdin_readable = true;
493
494                         } else if (ev[i].data.fd == STDOUT_FILENO) {
495
496                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
497                                         stdout_writable = true;
498
499                         } else if (ev[i].data.fd == master) {
500
501                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
502                                         master_readable = true;
503
504                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
505                                         master_writable = true;
506
507                         } else if (ev[i].data.fd == signal_fd) {
508                                 struct signalfd_siginfo sfsi;
509                                 ssize_t n;
510
511                                 if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {
512
513                                         if (n >= 0) {
514                                                 log_error("Failed to read from signalfd: invalid block size");
515                                                 r = -EIO;
516                                                 goto finish;
517                                         }
518
519                                         if (errno != EINTR && errno != EAGAIN) {
520                                                 log_error("Failed to read from signalfd: %m");
521                                                 r = -errno;
522                                                 goto finish;
523                                         }
524                                 } else {
525
526                                         if (sfsi.ssi_signo == SIGWINCH) {
527                                                 struct winsize ws;
528
529                                                 /* The window size changed, let's forward that. */
530                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
531                                                         ioctl(master, TIOCSWINSZ, &ws);
532                                         } else {
533                                                 r = 0;
534                                                 goto finish;
535                                         }
536                                 }
537                         }
538                 }
539
540                 while ((stdin_readable && in_buffer_full <= 0) ||
541                        (master_writable && in_buffer_full > 0) ||
542                        (master_readable && out_buffer_full <= 0) ||
543                        (stdout_writable && out_buffer_full > 0)) {
544
545                         if (stdin_readable && in_buffer_full < LINE_MAX) {
546
547                                 if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {
548
549                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
550                                                 stdin_readable = false;
551                                         else {
552                                                 log_error("read(): %m");
553                                                 r = -errno;
554                                                 goto finish;
555                                         }
556                                 } else
557                                         in_buffer_full += (size_t) k;
558                         }
559
560                         if (master_writable && in_buffer_full > 0) {
561
562                                 if ((k = write(master, in_buffer, in_buffer_full)) < 0) {
563
564                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
565                                                 master_writable = false;
566                                         else {
567                                                 log_error("write(): %m");
568                                                 r = -errno;
569                                                 goto finish;
570                                         }
571
572                                 } else {
573                                         assert(in_buffer_full >= (size_t) k);
574                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
575                                         in_buffer_full -= k;
576                                 }
577                         }
578
579                         if (master_readable && out_buffer_full < LINE_MAX) {
580
581                                 if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {
582
583                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
584                                                 master_readable = false;
585                                         else {
586                                                 log_error("read(): %m");
587                                                 r = -errno;
588                                                 goto finish;
589                                         }
590                                 }  else
591                                         out_buffer_full += (size_t) k;
592                         }
593
594                         if (stdout_writable && out_buffer_full > 0) {
595
596                                 if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {
597
598                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
599                                                 stdout_writable = false;
600                                         else {
601                                                 log_error("write(): %m");
602                                                 r = -errno;
603                                                 goto finish;
604                                         }
605
606                                 } else {
607                                         assert(out_buffer_full >= (size_t) k);
608                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
609                                         out_buffer_full -= k;
610                                 }
611                         }
612                 }
613         }
614
615 finish:
616         if (ep >= 0)
617                 close_nointr_nofail(ep);
618
619         if (signal_fd >= 0)
620                 close_nointr_nofail(signal_fd);
621
622         return r;
623 }
624
625 int main(int argc, char *argv[]) {
626         pid_t pid = 0;
627         int r = EXIT_FAILURE, k;
628         char *oldcg = NULL, *newcg = NULL;
629         char **controller = NULL;
630         int master = -1;
631         const char *console = NULL;
632         struct termios saved_attr, raw_attr;
633         sigset_t mask;
634         bool saved_attr_valid = false;
635         struct winsize ws;
636
637         log_parse_environment();
638         log_open();
639
640         if ((r = parse_argv(argc, argv)) <= 0)
641                 goto finish;
642
643         if (arg_directory) {
644                 char *p;
645
646                 p = path_make_absolute_cwd(arg_directory);
647                 free(arg_directory);
648                 arg_directory = p;
649         } else
650                 arg_directory = get_current_dir_name();
651
652         if (!arg_directory) {
653                 log_error("Failed to determine path");
654                 goto finish;
655         }
656
657         path_kill_slashes(arg_directory);
658
659         if (geteuid() != 0) {
660                 log_error("Need to be root.");
661                 goto finish;
662         }
663
664         if (sd_booted() <= 0) {
665                 log_error("Not running on a systemd system.");
666                 goto finish;
667         }
668
669         if (path_equal(arg_directory, "/")) {
670                 log_error("Spawning container on root directory not supported.");
671                 goto finish;
672         }
673
674         if (is_os_tree(arg_directory) <= 0) {
675                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
676                 goto finish;
677         }
678
679         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
680                 log_error("Failed to determine current cgroup: %s", strerror(-k));
681                 goto finish;
682         }
683
684         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
685                 log_error("Failed to allocate cgroup path.");
686                 goto finish;
687         }
688
689         k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0);
690         if (k < 0)  {
691                 log_error("Failed to create cgroup: %s", strerror(-k));
692                 goto finish;
693         }
694
695         STRV_FOREACH(controller,arg_controllers) {
696                 k = cg_create_and_attach(*controller, newcg, 0);
697                 if (k < 0)
698                         log_warning("Failed to create cgroup in controller %s: %s", *controller, strerror(-k));
699         }
700
701         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
702                 log_error("Failed to acquire pseudo tty: %m");
703                 goto finish;
704         }
705
706         if (!(console = ptsname(master))) {
707                 log_error("Failed to determine tty name: %m");
708                 goto finish;
709         }
710
711         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
712
713         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
714                 ioctl(master, TIOCSWINSZ, &ws);
715
716         if (unlockpt(master) < 0) {
717                 log_error("Failed to unlock tty: %m");
718                 goto finish;
719         }
720
721         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
722                 log_error("Failed to get terminal attributes: %m");
723                 goto finish;
724         }
725
726         saved_attr_valid = true;
727
728         raw_attr = saved_attr;
729         cfmakeraw(&raw_attr);
730         raw_attr.c_lflag &= ~ECHO;
731
732         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
733                 log_error("Failed to set terminal attributes: %m");
734                 goto finish;
735         }
736
737         assert_se(sigemptyset(&mask) == 0);
738         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
739         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
740
741         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
742         if (pid < 0) {
743                 if (errno == EINVAL)
744                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
745                 else
746                         log_error("clone() failed: %m");
747
748                 goto finish;
749         }
750
751         if (pid == 0) {
752                 /* child */
753
754                 const char *hn;
755                 const char *home = NULL;
756                 uid_t uid = (uid_t) -1;
757                 gid_t gid = (gid_t) -1;
758                 const char *envp[] = {
759                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
760                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
761                         NULL, /* TERM */
762                         NULL, /* HOME */
763                         NULL, /* USER */
764                         NULL, /* LOGNAME */
765                         NULL
766                 };
767
768                 envp[2] = strv_find_prefix(environ, "TERM=");
769
770                 close_nointr_nofail(master);
771
772                 close_nointr(STDIN_FILENO);
773                 close_nointr(STDOUT_FILENO);
774                 close_nointr(STDERR_FILENO);
775
776                 close_all_fds(NULL, 0);
777
778                 reset_all_signal_handlers();
779
780                 assert_se(sigemptyset(&mask) == 0);
781                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
782
783                 if (setsid() < 0)
784                         goto child_fail;
785
786                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
787                         goto child_fail;
788
789                 /* Mark / as private, in case somebody marked it shared */
790                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
791                         goto child_fail;
792
793                 if (mount_all(arg_directory) < 0)
794                         goto child_fail;
795
796                 if (copy_devnodes(arg_directory, console) < 0)
797                         goto child_fail;
798
799                 if (chdir(arg_directory) < 0) {
800                         log_error("chdir(%s) failed: %m", arg_directory);
801                         goto child_fail;
802                 }
803
804                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
805                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
806                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
807                         goto child_fail;
808
809                 if (mount(arg_directory, "/", "bind", MS_BIND|MS_MOVE, NULL) < 0) {
810                         log_error("mount(MS_MOVE) failed: %m");
811                         goto child_fail;
812                 }
813
814                 if (chroot(".") < 0) {
815                         log_error("chroot() failed: %m");
816                         goto child_fail;
817                 }
818
819                 if (chdir("/") < 0) {
820                         log_error("chdir() failed: %m");
821                         goto child_fail;
822                 }
823
824                 umask(0022);
825
826                 loopback_setup();
827
828                 if (drop_capabilities() < 0)
829                         goto child_fail;
830
831                 if (arg_user) {
832
833                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
834                                 log_error("get_user_creds() failed: %m");
835                                 goto child_fail;
836                         }
837
838                         if (mkdir_parents(home, 0775) < 0) {
839                                 log_error("mkdir_parents() failed: %m");
840                                 goto child_fail;
841                         }
842
843                         if (safe_mkdir(home, 0775, uid, gid) < 0) {
844                                 log_error("safe_mkdir() failed: %m");
845                                 goto child_fail;
846                         }
847
848                         if (initgroups((const char*)arg_user, gid) < 0) {
849                                 log_error("initgroups() failed: %m");
850                                 goto child_fail;
851                         }
852
853                         if (setresgid(gid, gid, gid) < 0) {
854                                 log_error("setregid() failed: %m");
855                                 goto child_fail;
856                         }
857
858                         if (setresuid(uid, uid, uid) < 0) {
859                                 log_error("setreuid() failed: %m");
860                                 goto child_fail;
861                         }
862                 }
863
864                 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
865                     (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
866                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
867                     log_error("Out of memory");
868                     goto child_fail;
869                 }
870
871                 if ((hn = file_name_from_path(arg_directory)))
872                         sethostname(hn, strlen(hn));
873
874                 if (argc > optind)
875                         execvpe(argv[optind], argv + optind, (char**) envp);
876                 else {
877                         chdir(home ? home : "/root");
878                         execle("/bin/bash", "-bash", NULL, (char**) envp);
879                 }
880
881                 log_error("execv() failed: %m");
882
883         child_fail:
884                 _exit(EXIT_FAILURE);
885         }
886
887         if (process_pty(master, &mask) < 0)
888                 goto finish;
889
890         if (saved_attr_valid) {
891                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
892                 saved_attr_valid = false;
893         }
894
895         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
896
897         if (r < 0)
898                 r = EXIT_FAILURE;
899
900 finish:
901         if (saved_attr_valid)
902                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
903
904         if (master >= 0)
905                 close_nointr_nofail(master);
906
907         if (oldcg)
908                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
909
910         if (newcg)
911                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
912
913         free(arg_directory);
914         strv_free(arg_controllers);
915         free(oldcg);
916         free(newcg);
917
918         return r;
919 }