chiark / gitweb /
relicense to LGPLv2.1 (with exceptions)
[elogind.git] / src / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU Lesser General Public License as published by
10   the Free Software Foundation; either version 2.1 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   Lesser General Public License for more details.
17
18   You should have received a copy of the GNU Lesser General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41
42 #include <systemd/sd-daemon.h>
43
44 #include "log.h"
45 #include "util.h"
46 #include "mkdir.h"
47 #include "audit.h"
48 #include "missing.h"
49 #include "cgroup-util.h"
50 #include "strv.h"
51 #include "loopback-setup.h"
52
/* Settings collected from the command line by parse_argv().
 *
 * Objects with static storage duration start out zeroed, so the two
 * strings are NULL and the flag is false until an option sets them. */
static char *arg_directory;       /* -D / --directory: heap-allocated path */
static char *arg_user;            /* -u / --user: heap-allocated name or uid */
static bool arg_private_network;  /* --private-network was given */
56
57 static int help(void) {
58
59         printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
60                "Spawn a minimal namespace container for debugging, testing and building.\n\n"
61                "  -h --help            Show this help\n"
62                "  -D --directory=NAME  Root directory for the container\n"
63                "  -u --user=USER       Run the command under specified user or uid\n"
64                "     --private-network Disable network in container\n",
65                program_invocation_short_name);
66
67         return 0;
68 }
69
70 static int parse_argv(int argc, char *argv[]) {
71
72         enum {
73                 ARG_PRIVATE_NETWORK = 0x100
74         };
75
76         static const struct option options[] = {
77                 { "help",            no_argument,       NULL, 'h'                 },
78                 { "directory",       required_argument, NULL, 'D'                 },
79                 { "user",            required_argument, NULL, 'u'                 },
80                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
81                 { NULL,              0,                 NULL, 0                   }
82         };
83
84         int c;
85
86         assert(argc >= 0);
87         assert(argv);
88
89         while ((c = getopt_long(argc, argv, "+hD:u:", options, NULL)) >= 0) {
90
91                 switch (c) {
92
93                 case 'h':
94                         help();
95                         return 0;
96
97                 case 'D':
98                         free(arg_directory);
99                         if (!(arg_directory = strdup(optarg))) {
100                                 log_error("Failed to duplicate root directory.");
101                                 return -ENOMEM;
102                         }
103
104                         break;
105
106                 case 'u':
107                         free(arg_user);
108                         if (!(arg_user = strdup(optarg))) {
109                                 log_error("Failed to duplicate user name.");
110                                 return -ENOMEM;
111                         }
112
113                         break;
114
115                 case ARG_PRIVATE_NETWORK:
116                         arg_private_network = true;
117                         break;
118
119                 case '?':
120                         return -EINVAL;
121
122                 default:
123                         log_error("Unknown option code %c", c);
124                         return -EINVAL;
125                 }
126         }
127
128         return 1;
129 }
130
/* Mounts the API file systems listed in the table below underneath dest,
 * then tries to bind mount the host's time zone files read-only into it.
 *
 * All table entries are attempted even after an error. The return value is
 * 0, or the first error recorded, as a negative errno-style value. Failed
 * mounts of entries that are not marked fatal, and any failure while
 * handling the time zone files, are ignored. */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;             /* record a failed mount() in the return value */
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV,      true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND,                           true  },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                           true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT,      true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID|MS_STRICTATIME,          true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                           true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV|MS_STRICTATIME, true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        /* Host files bind mounted (then remounted r/o) onto the same path below dest */
        static const char *const tz_files[] = {
                "/etc/localtime",
                "/etc/timezone"
        };

        int result = 0;
        char *target;
        unsigned i;

        for (i = 0; i < ELEMENTSOF(mount_table); i++) {
                const MountPoint *m = mount_table + i;
                int q;

                if (asprintf(&target, "%s/%s", dest, m->where) < 0) {
                        log_error("Out of memory");

                        if (result == 0)
                                result = -ENOMEM;

                        /* Unlike the other errors, this one ends the table walk */
                        break;
                }

                q = path_is_mount_point(target, false);
                if (q < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", target, strerror(-q));
                        free(target);

                        if (result == 0)
                                result = q;

                        continue;
                }

                /* The result of mkdir_p() is not checked; a missing
                 * directory shows up as a mount() failure below. */
                mkdir_p(target, 0755);

                if (mount(m->what, target, m->type, m->flags, m->options) < 0 && m->fatal) {

                        log_error("mount(%s) failed: %m", target);

                        if (result == 0)
                                result = -errno;
                }

                free(target);
        }

        /* Fix the timezone, if possible */
        for (i = 0; i < ELEMENTSOF(tz_files); i++) {

                if (asprintf(&target, "%s%s", dest, tz_files[i]) < 0)
                        continue;

                if (mount(tz_files[i], target, "bind", MS_BIND, NULL) >= 0)
                        mount(tz_files[i], target, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(target);
        }

        return result;
}
220
221 static int copy_devnodes(const char *dest, const char *console) {
222
223         static const char devnodes[] =
224                 "null\0"
225                 "zero\0"
226                 "full\0"
227                 "random\0"
228                 "urandom\0"
229                 "tty\0"
230                 "ptmx\0"
231                 "kmsg\0"
232                 "rtc0\0";
233
234         const char *d;
235         int r = 0, k;
236         mode_t u;
237         struct stat st;
238         char *from = NULL, *to = NULL;
239
240         assert(dest);
241         assert(console);
242
243         u = umask(0000);
244
245         NULSTR_FOREACH(d, devnodes) {
246                 from = to = NULL;
247
248                 asprintf(&from, "/dev/%s", d);
249                 asprintf(&to, "%s/dev/%s", dest, d);
250
251                 if (!from || !to) {
252                         log_error("Failed to allocate devnode path");
253
254                         free(from);
255                         free(to);
256
257                         from = to = NULL;
258
259                         if (r == 0)
260                                 r = -ENOMEM;
261
262                         break;
263                 }
264
265                 if (stat(from, &st) < 0) {
266
267                         if (errno != ENOENT) {
268                                 log_error("Failed to stat %s: %m", from);
269                                 if (r == 0)
270                                         r = -errno;
271                         }
272
273                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
274
275                         log_error("%s is not a char or block device, cannot copy.", from);
276                         if (r == 0)
277                                 r = -EIO;
278
279                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
280
281                         log_error("mknod(%s) failed: %m", dest);
282                         if (r == 0)
283                                 r = -errno;
284                 }
285
286                 free(from);
287                 free(to);
288         }
289
290         if (stat(console, &st) < 0) {
291
292                 log_error("Failed to stat %s: %m", console);
293                 if (r == 0)
294                         r = -errno;
295
296                 goto finish;
297
298         } else if (!S_ISCHR(st.st_mode)) {
299
300                 log_error("/dev/console is not a char device.");
301                 if (r == 0)
302                         r = -EIO;
303
304                 goto finish;
305         }
306
307         if (asprintf(&to, "%s/dev/console", dest) < 0) {
308
309                 log_error("Out of memory");
310                 if (r == 0)
311                         r = -ENOMEM;
312
313                  goto finish;
314         }
315
316         /* We need to bind mount the right tty to /dev/console since
317          * ptys can only exist on pts file systems. To have something
318          * to bind mount things on we create a device node first, that
319          * has the right major/minor (note that the major minor
320          * doesn't actually matter here, since we mount it over
321          * anyway). */
322
323         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
324                 log_error("mknod for /dev/console failed: %m");
325
326         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
327                 log_error("bind mount for /dev/console failed: %m");
328
329                 if (r == 0)
330                         r = -errno;
331         }
332
333         free(to);
334
335         if ((k = chmod_and_chown(console, 0600, 0, 0)) < 0) {
336                 log_error("Failed to correct access mode for TTY: %s", strerror(-k));
337
338                 if (r == 0)
339                         r = k;
340         }
341
342 finish:
343         umask(u);
344
345         return r;
346 }
347
348 static int drop_capabilities(void) {
349         static const unsigned long retain[] = {
350                 CAP_CHOWN,
351                 CAP_DAC_OVERRIDE,
352                 CAP_DAC_READ_SEARCH,
353                 CAP_FOWNER,
354                 CAP_FSETID,
355                 CAP_IPC_OWNER,
356                 CAP_KILL,
357                 CAP_LEASE,
358                 CAP_LINUX_IMMUTABLE,
359                 CAP_NET_BIND_SERVICE,
360                 CAP_NET_BROADCAST,
361                 CAP_NET_RAW,
362                 CAP_SETGID,
363                 CAP_SETFCAP,
364                 CAP_SETPCAP,
365                 CAP_SETUID,
366                 CAP_SYS_ADMIN,
367                 CAP_SYS_CHROOT,
368                 CAP_SYS_NICE,
369                 CAP_SYS_PTRACE,
370                 CAP_SYS_TTY_CONFIG
371         };
372
373         unsigned long l;
374
375         for (l = 0; l <= cap_last_cap(); l++) {
376                 unsigned i;
377
378                 for (i = 0; i < ELEMENTSOF(retain); i++)
379                         if (retain[i] == l)
380                                 break;
381
382                 if (i < ELEMENTSOF(retain))
383                         continue;
384
385                 if (prctl(PR_CAPBSET_DROP, l) < 0) {
386                         log_error("PR_CAPBSET_DROP failed: %m");
387                         return -errno;
388                 }
389         }
390
391         return 0;
392 }
393
/* Checks whether path looks like the root of an OS installation, using the
 * existence of <path>/bin/sh as the indicator.
 *
 * Returns 1 if that file can be accessed, 0 if it cannot, and -ENOMEM if
 * the probe path could not be allocated. */
static int is_os_tree(const char *path) {
        /* We use /bin/sh as flag file if something is an OS */
        static const char flag_file[] = "/bin/sh";

        size_t prefix_len;
        char *probe;
        int found;

        prefix_len = strlen(path);

        /* sizeof(flag_file) already accounts for the terminating NUL */
        probe = malloc(prefix_len + sizeof(flag_file));
        if (!probe)
                return -ENOMEM;

        memcpy(probe, path, prefix_len);
        memcpy(probe + prefix_len, flag_file, sizeof(flag_file));

        found = access(probe, F_OK) >= 0;
        free(probe);

        return found ? 1 : 0;
}
407
408 static int process_pty(int master, sigset_t *mask) {
409
410         char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
411         size_t in_buffer_full = 0, out_buffer_full = 0;
412         struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
413         bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
414         int ep = -1, signal_fd = -1, r;
415
416         fd_nonblock(STDIN_FILENO, 1);
417         fd_nonblock(STDOUT_FILENO, 1);
418         fd_nonblock(master, 1);
419
420         if ((signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
421                 log_error("signalfd(): %m");
422                 r = -errno;
423                 goto finish;
424         }
425
426         if ((ep = epoll_create1(EPOLL_CLOEXEC)) < 0) {
427                 log_error("Failed to create epoll: %m");
428                 r = -errno;
429                 goto finish;
430         }
431
432         zero(stdin_ev);
433         stdin_ev.events = EPOLLIN|EPOLLET;
434         stdin_ev.data.fd = STDIN_FILENO;
435
436         zero(stdout_ev);
437         stdout_ev.events = EPOLLOUT|EPOLLET;
438         stdout_ev.data.fd = STDOUT_FILENO;
439
440         zero(master_ev);
441         master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
442         master_ev.data.fd = master;
443
444         zero(signal_ev);
445         signal_ev.events = EPOLLIN;
446         signal_ev.data.fd = signal_fd;
447
448         if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
449             epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
450             epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
451             epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
452                 log_error("Failed to regiser fds in epoll: %m");
453                 r = -errno;
454                 goto finish;
455         }
456
457         for (;;) {
458                 struct epoll_event ev[16];
459                 ssize_t k;
460                 int i, nfds;
461
462                 if ((nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1)) < 0) {
463
464                         if (errno == EINTR || errno == EAGAIN)
465                                 continue;
466
467                         log_error("epoll_wait(): %m");
468                         r = -errno;
469                         goto finish;
470                 }
471
472                 assert(nfds >= 1);
473
474                 for (i = 0; i < nfds; i++) {
475                         if (ev[i].data.fd == STDIN_FILENO) {
476
477                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
478                                         stdin_readable = true;
479
480                         } else if (ev[i].data.fd == STDOUT_FILENO) {
481
482                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
483                                         stdout_writable = true;
484
485                         } else if (ev[i].data.fd == master) {
486
487                                 if (ev[i].events & (EPOLLIN|EPOLLHUP))
488                                         master_readable = true;
489
490                                 if (ev[i].events & (EPOLLOUT|EPOLLHUP))
491                                         master_writable = true;
492
493                         } else if (ev[i].data.fd == signal_fd) {
494                                 struct signalfd_siginfo sfsi;
495                                 ssize_t n;
496
497                                 if ((n = read(signal_fd, &sfsi, sizeof(sfsi))) != sizeof(sfsi)) {
498
499                                         if (n >= 0) {
500                                                 log_error("Failed to read from signalfd: invalid block size");
501                                                 r = -EIO;
502                                                 goto finish;
503                                         }
504
505                                         if (errno != EINTR && errno != EAGAIN) {
506                                                 log_error("Failed to read from signalfd: %m");
507                                                 r = -errno;
508                                                 goto finish;
509                                         }
510                                 } else {
511
512                                         if (sfsi.ssi_signo == SIGWINCH) {
513                                                 struct winsize ws;
514
515                                                 /* The window size changed, let's forward that. */
516                                                 if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
517                                                         ioctl(master, TIOCSWINSZ, &ws);
518                                         } else {
519                                                 r = 0;
520                                                 goto finish;
521                                         }
522                                 }
523                         }
524                 }
525
526                 while ((stdin_readable && in_buffer_full <= 0) ||
527                        (master_writable && in_buffer_full > 0) ||
528                        (master_readable && out_buffer_full <= 0) ||
529                        (stdout_writable && out_buffer_full > 0)) {
530
531                         if (stdin_readable && in_buffer_full < LINE_MAX) {
532
533                                 if ((k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full)) < 0) {
534
535                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
536                                                 stdin_readable = false;
537                                         else {
538                                                 log_error("read(): %m");
539                                                 r = -errno;
540                                                 goto finish;
541                                         }
542                                 } else
543                                         in_buffer_full += (size_t) k;
544                         }
545
546                         if (master_writable && in_buffer_full > 0) {
547
548                                 if ((k = write(master, in_buffer, in_buffer_full)) < 0) {
549
550                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
551                                                 master_writable = false;
552                                         else {
553                                                 log_error("write(): %m");
554                                                 r = -errno;
555                                                 goto finish;
556                                         }
557
558                                 } else {
559                                         assert(in_buffer_full >= (size_t) k);
560                                         memmove(in_buffer, in_buffer + k, in_buffer_full - k);
561                                         in_buffer_full -= k;
562                                 }
563                         }
564
565                         if (master_readable && out_buffer_full < LINE_MAX) {
566
567                                 if ((k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full)) < 0) {
568
569                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
570                                                 master_readable = false;
571                                         else {
572                                                 log_error("read(): %m");
573                                                 r = -errno;
574                                                 goto finish;
575                                         }
576                                 }  else
577                                         out_buffer_full += (size_t) k;
578                         }
579
580                         if (stdout_writable && out_buffer_full > 0) {
581
582                                 if ((k = write(STDOUT_FILENO, out_buffer, out_buffer_full)) < 0) {
583
584                                         if (errno == EAGAIN || errno == EPIPE || errno == ECONNRESET || errno == EIO)
585                                                 stdout_writable = false;
586                                         else {
587                                                 log_error("write(): %m");
588                                                 r = -errno;
589                                                 goto finish;
590                                         }
591
592                                 } else {
593                                         assert(out_buffer_full >= (size_t) k);
594                                         memmove(out_buffer, out_buffer + k, out_buffer_full - k);
595                                         out_buffer_full -= k;
596                                 }
597                         }
598                 }
599         }
600
601 finish:
602         if (ep >= 0)
603                 close_nointr_nofail(ep);
604
605         if (signal_fd >= 0)
606                 close_nointr_nofail(signal_fd);
607
608         return r;
609 }
610
611 int main(int argc, char *argv[]) {
612         pid_t pid = 0;
613         int r = EXIT_FAILURE, k;
614         char *oldcg = NULL, *newcg = NULL;
615         int master = -1;
616         const char *console = NULL;
617         struct termios saved_attr, raw_attr;
618         sigset_t mask;
619         bool saved_attr_valid = false;
620         struct winsize ws;
621
622         log_parse_environment();
623         log_open();
624
625         if ((r = parse_argv(argc, argv)) <= 0)
626                 goto finish;
627
628         if (arg_directory) {
629                 char *p;
630
631                 p = path_make_absolute_cwd(arg_directory);
632                 free(arg_directory);
633                 arg_directory = p;
634         } else
635                 arg_directory = get_current_dir_name();
636
637         if (!arg_directory) {
638                 log_error("Failed to determine path");
639                 goto finish;
640         }
641
642         path_kill_slashes(arg_directory);
643
644         if (geteuid() != 0) {
645                 log_error("Need to be root.");
646                 goto finish;
647         }
648
649         if (sd_booted() <= 0) {
650                 log_error("Not running on a systemd system.");
651                 goto finish;
652         }
653
654         if (path_equal(arg_directory, "/")) {
655                 log_error("Spawning container on root directory not supported.");
656                 goto finish;
657         }
658
659         if (is_os_tree(arg_directory) <= 0) {
660                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
661                 goto finish;
662         }
663
664         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
665                 log_error("Failed to determine current cgroup: %s", strerror(-k));
666                 goto finish;
667         }
668
669         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
670                 log_error("Failed to allocate cgroup path.");
671                 goto finish;
672         }
673
674         if ((k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0)) < 0)  {
675                 log_error("Failed to create cgroup: %s", strerror(-k));
676                 goto finish;
677         }
678
679         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
680                 log_error("Failed to acquire pseudo tty: %m");
681                 goto finish;
682         }
683
684         if (!(console = ptsname(master))) {
685                 log_error("Failed to determine tty name: %m");
686                 goto finish;
687         }
688
689         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
690
691         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
692                 ioctl(master, TIOCSWINSZ, &ws);
693
694         if (unlockpt(master) < 0) {
695                 log_error("Failed to unlock tty: %m");
696                 goto finish;
697         }
698
699         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
700                 log_error("Failed to get terminal attributes: %m");
701                 goto finish;
702         }
703
704         saved_attr_valid = true;
705
706         raw_attr = saved_attr;
707         cfmakeraw(&raw_attr);
708         raw_attr.c_lflag &= ~ECHO;
709
710         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
711                 log_error("Failed to set terminal attributes: %m");
712                 goto finish;
713         }
714
715         assert_se(sigemptyset(&mask) == 0);
716         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
717         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
718
719         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
720         if (pid < 0) {
721                 if (errno == EINVAL)
722                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
723                 else
724                         log_error("clone() failed: %m");
725
726                 goto finish;
727         }
728
729         if (pid == 0) {
730                 /* child */
731
732                 const char *hn;
733                 const char *home = NULL;
734                 uid_t uid = (uid_t) -1;
735                 gid_t gid = (gid_t) -1;
736                 const char *envp[] = {
737                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
738                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
739                         NULL, /* TERM */
740                         NULL, /* HOME */
741                         NULL, /* USER */
742                         NULL, /* LOGNAME */
743                         NULL
744                 };
745
746                 envp[2] = strv_find_prefix(environ, "TERM=");
747
748                 close_nointr_nofail(master);
749
750                 close_nointr(STDIN_FILENO);
751                 close_nointr(STDOUT_FILENO);
752                 close_nointr(STDERR_FILENO);
753
754                 close_all_fds(NULL, 0);
755
756                 reset_all_signal_handlers();
757
758                 assert_se(sigemptyset(&mask) == 0);
759                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
760
761                 if (setsid() < 0)
762                         goto child_fail;
763
764                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
765                         goto child_fail;
766
767                 /* Mark / as private, in case somebody marked it shared */
768                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
769                         goto child_fail;
770
771                 if (mount_all(arg_directory) < 0)
772                         goto child_fail;
773
774                 if (copy_devnodes(arg_directory, console) < 0)
775                         goto child_fail;
776
777                 if (chdir(arg_directory) < 0) {
778                         log_error("chdir(%s) failed: %m", arg_directory);
779                         goto child_fail;
780                 }
781
782                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
783                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
784                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
785                         goto child_fail;
786
787                 if (mount(arg_directory, "/", "bind", MS_BIND|MS_MOVE, NULL) < 0) {
788                         log_error("mount(MS_MOVE) failed: %m");
789                         goto child_fail;
790                 }
791
792                 if (chroot(".") < 0) {
793                         log_error("chroot() failed: %m");
794                         goto child_fail;
795                 }
796
797                 if (chdir("/") < 0) {
798                         log_error("chdir() failed: %m");
799                         goto child_fail;
800                 }
801
802                 umask(0022);
803
804                 loopback_setup();
805
806                 if (drop_capabilities() < 0)
807                         goto child_fail;
808
809                 if (arg_user) {
810
811                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
812                                 log_error("get_user_creds() failed: %m");
813                                 goto child_fail;
814                         }
815
816                         if (mkdir_parents(home, 0775) < 0) {
817                                 log_error("mkdir_parents() failed: %m");
818                                 goto child_fail;
819                         }
820
821                         if (safe_mkdir(home, 0775, uid, gid) < 0) {
822                                 log_error("safe_mkdir() failed: %m");
823                                 goto child_fail;
824                         }
825
826                         if (initgroups((const char*)arg_user, gid) < 0) {
827                                 log_error("initgroups() failed: %m");
828                                 goto child_fail;
829                         }
830
831                         if (setresgid(gid, gid, gid) < 0) {
832                                 log_error("setregid() failed: %m");
833                                 goto child_fail;
834                         }
835
836                         if (setresuid(uid, uid, uid) < 0) {
837                                 log_error("setreuid() failed: %m");
838                                 goto child_fail;
839                         }
840                 }
841
842                 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
843                     (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
844                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
845                     log_error("Out of memory");
846                     goto child_fail;
847                 }
848
849                 if ((hn = file_name_from_path(arg_directory)))
850                         sethostname(hn, strlen(hn));
851
852                 if (argc > optind)
853                         execvpe(argv[optind], argv + optind, (char**) envp);
854                 else {
855                         chdir(home ? home : "/root");
856                         execle("/bin/bash", "-bash", NULL, (char**) envp);
857                 }
858
859                 log_error("execv() failed: %m");
860
861         child_fail:
862                 _exit(EXIT_FAILURE);
863         }
864
865         if (process_pty(master, &mask) < 0)
866                 goto finish;
867
868         if (saved_attr_valid) {
869                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
870                 saved_attr_valid = false;
871         }
872
873         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
874
875         if (r < 0)
876                 r = EXIT_FAILURE;
877
878 finish:
879         if (saved_attr_valid)
880                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
881
882         if (master >= 0)
883                 close_nointr_nofail(master);
884
885         if (oldcg)
886                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
887
888         if (newcg)
889                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
890
891         free(arg_directory);
892         free(oldcg);
893         free(newcg);
894
895         return r;
896 }