chiark / gitweb /
a couple of fixes to make llvm-analyze quiet
[elogind.git] / src / nspawn.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4   This file is part of systemd.
5
6   Copyright 2010 Lennart Poettering
7
8   systemd is free software; you can redistribute it and/or modify it
9   under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2 of the License, or
11   (at your option) any later version.
12
13   systemd is distributed in the hope that it will be useful, but
14   WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16   General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <signal.h>
23 #include <sched.h>
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/syscall.h>
27 #include <sys/mount.h>
28 #include <sys/wait.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <stdio.h>
32 #include <errno.h>
33 #include <sys/prctl.h>
34 #include <sys/capability.h>
35 #include <getopt.h>
36 #include <sys/epoll.h>
37 #include <termios.h>
38 #include <sys/signalfd.h>
39 #include <grp.h>
40 #include <linux/fs.h>
41
42 #include <systemd/sd-daemon.h>
43
44 #include "log.h"
45 #include "util.h"
46 #include "missing.h"
47 #include "cgroup-util.h"
48 #include "strv.h"
49 #include "loopback-setup.h"
50
/* Command line settings, filled in by parse_argv(). Objects with static
 * storage duration start out zeroed, i.e. NULL / false. */
static char *arg_directory;        /* -D/--directory: container root (heap allocated) */
static char *arg_user;             /* -u/--user: user name or uid to run as (heap allocated) */
static bool arg_private_network;   /* --private-network: clone with CLONE_NEWNET */
54
/* Writes the usage summary to stdout. Always returns 0. */
static int help(void) {

        /* Only the first line depends on the program name, the rest is fixed text. */
        printf("%s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n", program_invocation_short_name);

        fputs("Spawn a minimal namespace container for debugging, testing and building.\n\n"
              "  -h --help            Show this help\n"
              "  -D --directory=NAME  Root directory for the container\n"
              "  -u --user=USER       Run the command under specified user or uid\n"
              "     --private-network Disable network in container\n",
              stdout);

        return 0;
}
67
68 static int parse_argv(int argc, char *argv[]) {
69
70         enum {
71                 ARG_PRIVATE_NETWORK = 0x100
72         };
73
74         static const struct option options[] = {
75                 { "help",            no_argument,       NULL, 'h'                 },
76                 { "directory",       required_argument, NULL, 'D'                 },
77                 { "user",            required_argument, NULL, 'u'                 },
78                 { "private-network", no_argument,       NULL, ARG_PRIVATE_NETWORK },
79                 { NULL,              0,                 NULL, 0                   }
80         };
81
82         int c;
83
84         assert(argc >= 0);
85         assert(argv);
86
87         while ((c = getopt_long(argc, argv, "+hD:u:", options, NULL)) >= 0) {
88
89                 switch (c) {
90
91                 case 'h':
92                         help();
93                         return 0;
94
95                 case 'D':
96                         free(arg_directory);
97                         if (!(arg_directory = strdup(optarg))) {
98                                 log_error("Failed to duplicate root directory.");
99                                 return -ENOMEM;
100                         }
101
102                         break;
103
104                 case 'u':
105                         free(arg_user);
106                         if (!(arg_user = strdup(optarg))) {
107                                 log_error("Failed to duplicate user name.");
108                                 return -ENOMEM;
109                         }
110
111                         break;
112
113                 case ARG_PRIVATE_NETWORK:
114                         arg_private_network = true;
115                         break;
116
117                 case '?':
118                         return -EINVAL;
119
120                 default:
121                         log_error("Unknown option code %c", c);
122                         return -EINVAL;
123                 }
124         }
125
126         return 1;
127 }
128
/*
 * Mounts the file systems listed in mount_table underneath the container
 * root 'dest', then tries to bind mount the host's timezone files read-only
 * into the container.
 *
 * A failing table entry never aborts the run; the remaining entries are still
 * attempted. The first error encountered is remembered and returned as a
 * negative errno-style value; 0 is returned if nothing failed. Entries whose
 * 'fatal' flag is false have their mount() failures ignored entirely, as do
 * the timezone mounts.
 */
static int mount_all(const char *dest) {

        typedef struct MountPoint {
                const char *what;
                const char *where;
                const char *type;
                const char *options;
                unsigned long flags;
                bool fatal;     /* if false, a mount() failure is neither logged nor reported */
        } MountPoint;

        static const MountPoint mount_table[] = {
                { "proc",      "/proc",     "proc",  NULL,       MS_NOSUID|MS_NOEXEC|MS_NODEV, true  },
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND, true                       },   /* Bind mount first */
                { "/proc/sys", "/proc/sys", "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND,                      true  },   /* Bind mount first */
                { "/sys",      "/sys",      "bind",  NULL,       MS_BIND|MS_RDONLY|MS_REMOUNT, true  },   /* Then, make it r/o */
                { "tmpfs",     "/dev",      "tmpfs", "mode=755", MS_NOSUID,                    true  },
                { "/dev/pts",  "/dev/pts",  "bind",  NULL,       MS_BIND,                      true  },
                { "tmpfs",     "/run",      "tmpfs", "mode=755", MS_NOSUID|MS_NODEV,           true  },
#ifdef HAVE_SELINUX
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND,                      false },  /* Bind mount first */
                { "/sys/fs/selinux", "/sys/fs/selinux", "bind", NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, false },  /* Then, make it r/o */
#endif
        };

        /* Host files that are bind mounted read-only to the same path inside the container */
        static const char *const timezone_files[] = {
                "/etc/localtime",
                "/etc/timezone"
        };

        unsigned k;
        int r = 0;
        char *where = NULL;

        assert(dest);

        for (k = 0; k < ELEMENTSOF(mount_table); k++) {
                int t;

                if (asprintf(&where, "%s/%s", dest, mount_table[k].where) < 0) {
                        log_error("Out of memory");

                        if (r == 0)
                                r = -ENOMEM;

                        break;
                }

                /* Only a failure of the check itself is acted upon; whether
                 * the path already is a mount point does not change what we do. */
                t = path_is_mount_point(where, false);
                if (t < 0) {
                        log_error("Failed to detect whether %s is a mount point: %s", where, strerror(-t));
                        free(where);

                        if (r == 0)
                                r = t;

                        continue;
                }

                /* Best effort: if this fails the mount() below fails too and reports it */
                mkdir_p(where, 0755);

                if (mount(mount_table[k].what,
                          where,
                          mount_table[k].type,
                          mount_table[k].flags,
                          mount_table[k].options) < 0 &&
                    mount_table[k].fatal) {

                        /* Record errno first, so that the result does not
                         * depend on log_error() leaving errno untouched. */
                        if (r == 0)
                                r = -errno;

                        log_error("mount(%s) failed: %m", where);
                }

                free(where);
        }

        /* Fix the timezone, if possible */
        for (k = 0; k < ELEMENTSOF(timezone_files); k++) {

                if (asprintf(&where, "%s%s", dest, timezone_files[k]) < 0)
                        continue;

                /* Bind mount first, then make it r/o */
                if (mount(timezone_files[k], where, "bind", MS_BIND, NULL) >= 0)
                        mount(timezone_files[k], where, "bind", MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);

                free(where);
        }

        return r;
}
218
219 static int copy_devnodes(const char *dest, const char *console) {
220
221         static const char devnodes[] =
222                 "null\0"
223                 "zero\0"
224                 "full\0"
225                 "random\0"
226                 "urandom\0"
227                 "tty\0"
228                 "ptmx\0"
229                 "kmsg\0"
230                 "rtc0\0";
231
232         const char *d;
233         int r = 0, k;
234         mode_t u;
235         struct stat st;
236         char *from = NULL, *to = NULL;
237
238         assert(dest);
239         assert(console);
240
241         u = umask(0000);
242
243         NULSTR_FOREACH(d, devnodes) {
244                 from = to = NULL;
245
246                 asprintf(&from, "/dev/%s", d);
247                 asprintf(&to, "%s/dev/%s", dest, d);
248
249                 if (!from || !to) {
250                         log_error("Failed to allocate devnode path");
251
252                         free(from);
253                         free(to);
254
255                         from = to = NULL;
256
257                         if (r == 0)
258                                 r = -ENOMEM;
259
260                         break;
261                 }
262
263                 if (stat(from, &st) < 0) {
264
265                         if (errno != ENOENT) {
266                                 log_error("Failed to stat %s: %m", from);
267                                 if (r == 0)
268                                         r = -errno;
269                         }
270
271                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
272
273                         log_error("%s is not a char or block device, cannot copy.", from);
274                         if (r == 0)
275                                 r = -EIO;
276
277                 } else if (mknod(to, st.st_mode, st.st_rdev) < 0) {
278
279                         log_error("mknod(%s) failed: %m", dest);
280                         if (r == 0)
281                                 r = -errno;
282                 }
283
284                 free(from);
285                 free(to);
286         }
287
288         if (stat(console, &st) < 0) {
289
290                 log_error("Failed to stat %s: %m", console);
291                 if (r == 0)
292                         r = -errno;
293
294                 goto finish;
295
296         } else if (!S_ISCHR(st.st_mode)) {
297
298                 log_error("/dev/console is not a char device.");
299                 if (r == 0)
300                         r = -EIO;
301
302                 goto finish;
303         }
304
305         if (asprintf(&to, "%s/dev/console", dest) < 0) {
306
307                 log_error("Out of memory");
308                 if (r == 0)
309                         r = -ENOMEM;
310
311                  goto finish;
312         }
313
314         /* We need to bind mount the right tty to /dev/console since
315          * ptys can only exist on pts file systems. To have something
316          * to bind mount things on we create a device node first, that
317          * has the right major/minor (note that the major minor
318          * doesn't actually matter here, since we mount it over
319          * anyway). */
320
321         if (mknod(to, (st.st_mode & ~07777) | 0600, st.st_rdev) < 0)
322                 log_error("mknod for /dev/console failed: %m");
323
324         if (mount(console, to, "bind", MS_BIND, NULL) < 0) {
325                 log_error("bind mount for /dev/console failed: %m");
326
327                 if (r == 0)
328                         r = -errno;
329         }
330
331         free(to);
332
333         if ((k = chmod_and_chown(console, 0600, 0, 0)) < 0) {
334                 log_error("Failed to correct access mode for TTY: %s", strerror(-k));
335
336                 if (r == 0)
337                         r = k;
338         }
339
340 finish:
341         umask(u);
342
343         return r;
344 }
345
/*
 * Issues prctl(PR_CAPBSET_DROP) for every capability number from 0 up to and
 * including cap_last_cap() that is not listed in 'retain'.
 *
 * Returns 0 on success, or a negative errno-style value as soon as one
 * prctl() call fails.
 */
static int drop_capabilities(void) {
        static const unsigned long retain[] = {
                CAP_CHOWN,
                CAP_DAC_OVERRIDE,
                CAP_DAC_READ_SEARCH,
                CAP_FOWNER,
                CAP_FSETID,
                CAP_IPC_OWNER,
                CAP_KILL,
                CAP_LEASE,
                CAP_LINUX_IMMUTABLE,
                CAP_NET_BIND_SERVICE,
                CAP_NET_BROADCAST,
                CAP_NET_RAW,
                CAP_SETGID,
                CAP_SETFCAP,
                CAP_SETPCAP,
                CAP_SETUID,
                CAP_SYS_ADMIN,
                CAP_SYS_CHROOT,
                CAP_SYS_NICE,
                CAP_SYS_PTRACE,
                CAP_SYS_TTY_CONFIG
        };

        unsigned long cap;

        for (cap = 0; cap <= cap_last_cap(); cap++) {
                bool keep = false;
                unsigned idx = 0;

                /* Linear search of the whitelist; stops at the first hit */
                while (!keep && idx < ELEMENTSOF(retain)) {
                        keep = retain[idx] == cap;
                        idx++;
                }

                if (keep)
                        continue;

                if (prctl(PR_CAPBSET_DROP, cap) < 0) {
                        log_error("PR_CAPBSET_DROP failed: %m");
                        return -errno;
                }
        }

        return 0;
}
391
/*
 * Tells whether 'path' looks like the root of an OS installation. The
 * presence of <path>/bin/sh is used as the flag for that.
 *
 * Returns 1 if the file exists, 0 if access() fails for whatever reason, and
 * -ENOMEM if the path could not be allocated.
 */
static int is_os_tree(const char *path) {
        char *shell = NULL;
        int found;

        if (asprintf(&shell, "%s/bin/sh", path) < 0)
                return -ENOMEM;

        found = access(shell, F_OK) >= 0;
        free(shell);

        return found;
}
405
/* Returns true for read()/write() errors that merely mean "this direction of
 * the fd cannot be used right now" and must not abort the forwarding loop. */
static bool pty_io_error_is_benign(int error) {
        return error == EAGAIN ||
               error == EPIPE ||
               error == ECONNRESET ||
               error == EIO;
}

/*
 * Shovels data between our stdin/stdout and the pty master until a signal
 * tells us to stop.
 *
 * master: the pty master fd; it is switched to non-blocking mode, as are
 *         stdin and stdout.
 * mask:   the signals to receive through a signalfd (the caller is expected
 *         to have blocked them). SIGWINCH causes the window size of stdin to
 *         be copied to the master, any other signal ends the loop.
 *
 * Returns 0 when the loop was ended by a signal, or a negative errno-style
 * value on failure.
 */
static int process_pty(int master, sigset_t *mask) {

        char in_buffer[LINE_MAX], out_buffer[LINE_MAX];
        size_t in_buffer_full = 0, out_buffer_full = 0;
        struct epoll_event stdin_ev, stdout_ev, master_ev, signal_ev;
        bool stdin_readable = false, stdout_writable = false, master_readable = false, master_writable = false;
        int ep = -1, signal_fd = -1, r;

        fd_nonblock(STDIN_FILENO, 1);
        fd_nonblock(STDOUT_FILENO, 1);
        fd_nonblock(master, 1);

        signal_fd = signalfd(-1, mask, SFD_NONBLOCK|SFD_CLOEXEC);
        if (signal_fd < 0) {
                /* errno is recorded before logging throughout this function,
                 * so the result does not depend on log_error() preserving it */
                r = -errno;
                log_error("signalfd(): %m");
                goto finish;
        }

        ep = epoll_create1(EPOLL_CLOEXEC);
        if (ep < 0) {
                r = -errno;
                log_error("Failed to create epoll: %m");
                goto finish;
        }

        /* The data fds are watched edge-triggered: readiness is remembered
         * in the *_readable/*_writable flags until an I/O call clears it. */
        zero(stdin_ev);
        stdin_ev.events = EPOLLIN|EPOLLET;
        stdin_ev.data.fd = STDIN_FILENO;

        zero(stdout_ev);
        stdout_ev.events = EPOLLOUT|EPOLLET;
        stdout_ev.data.fd = STDOUT_FILENO;

        zero(master_ev);
        master_ev.events = EPOLLIN|EPOLLOUT|EPOLLET;
        master_ev.data.fd = master;

        zero(signal_ev);
        signal_ev.events = EPOLLIN;
        signal_ev.data.fd = signal_fd;

        if (epoll_ctl(ep, EPOLL_CTL_ADD, STDIN_FILENO, &stdin_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, STDOUT_FILENO, &stdout_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, master, &master_ev) < 0 ||
            epoll_ctl(ep, EPOLL_CTL_ADD, signal_fd, &signal_ev) < 0) {
                r = -errno;
                log_error("Failed to regiser fds in epoll: %m");
                goto finish;
        }

        for (;;) {
                struct epoll_event ev[16];
                ssize_t k;
                int i, nfds;

                nfds = epoll_wait(ep, ev, ELEMENTSOF(ev), -1);
                if (nfds < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        r = -errno;
                        log_error("epoll_wait(): %m");
                        goto finish;
                }

                assert(nfds >= 1);

                for (i = 0; i < nfds; i++) {
                        if (ev[i].data.fd == STDIN_FILENO) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        stdin_readable = true;

                        } else if (ev[i].data.fd == STDOUT_FILENO) {

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        stdout_writable = true;

                        } else if (ev[i].data.fd == master) {

                                if (ev[i].events & (EPOLLIN|EPOLLHUP))
                                        master_readable = true;

                                if (ev[i].events & (EPOLLOUT|EPOLLHUP))
                                        master_writable = true;

                        } else if (ev[i].data.fd == signal_fd) {
                                struct signalfd_siginfo sfsi;
                                ssize_t n;

                                n = read(signal_fd, &sfsi, sizeof(sfsi));
                                if (n != sizeof(sfsi)) {

                                        if (n >= 0) {
                                                log_error("Failed to read from signalfd: invalid block size");
                                                r = -EIO;
                                                goto finish;
                                        }

                                        if (errno != EINTR && errno != EAGAIN) {
                                                r = -errno;
                                                log_error("Failed to read from signalfd: %m");
                                                goto finish;
                                        }
                                } else {

                                        if (sfsi.ssi_signo == SIGWINCH) {
                                                struct winsize ws;

                                                /* The window size changed, let's forward that. */
                                                if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
                                                        ioctl(master, TIOCSWINSZ, &ws);
                                        } else {
                                                /* Any other signal in the mask means we are done */
                                                r = 0;
                                                goto finish;
                                        }
                                }
                        }
                }

                /* Keep going as long as some transfer can make progress:
                 * something to read into an empty buffer, or something to
                 * write from a filled one. */
                while ((stdin_readable && in_buffer_full <= 0) ||
                       (master_writable && in_buffer_full > 0) ||
                       (master_readable && out_buffer_full <= 0) ||
                       (stdout_writable && out_buffer_full > 0)) {

                        if (stdin_readable && in_buffer_full < LINE_MAX) {

                                k = read(STDIN_FILENO, in_buffer + in_buffer_full, LINE_MAX - in_buffer_full);
                                if (k < 0) {

                                        if (pty_io_error_is_benign(errno))
                                                stdin_readable = false;
                                        else {
                                                r = -errno;
                                                log_error("read(): %m");
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* End of file: nothing more will come. Without
                                         * clearing the flag this loop would spin forever
                                         * and never get back to epoll_wait(). */
                                        stdin_readable = false;
                                else
                                        in_buffer_full += (size_t) k;
                        }

                        if (master_writable && in_buffer_full > 0) {

                                k = write(master, in_buffer, in_buffer_full);
                                if (k < 0) {

                                        if (pty_io_error_is_benign(errno))
                                                master_writable = false;
                                        else {
                                                r = -errno;
                                                log_error("write(): %m");
                                                goto finish;
                                        }

                                } else {
                                        /* Drop what was written, keep the rest at the front */
                                        assert(in_buffer_full >= (size_t) k);
                                        memmove(in_buffer, in_buffer + k, in_buffer_full - k);
                                        in_buffer_full -= k;
                                }
                        }

                        if (master_readable && out_buffer_full < LINE_MAX) {

                                k = read(master, out_buffer + out_buffer_full, LINE_MAX - out_buffer_full);
                                if (k < 0) {

                                        if (pty_io_error_is_benign(errno))
                                                master_readable = false;
                                        else {
                                                r = -errno;
                                                log_error("read(): %m");
                                                goto finish;
                                        }
                                } else if (k == 0)
                                        /* End of file, see above */
                                        master_readable = false;
                                else
                                        out_buffer_full += (size_t) k;
                        }

                        if (stdout_writable && out_buffer_full > 0) {

                                k = write(STDOUT_FILENO, out_buffer, out_buffer_full);
                                if (k < 0) {

                                        if (pty_io_error_is_benign(errno))
                                                stdout_writable = false;
                                        else {
                                                r = -errno;
                                                log_error("write(): %m");
                                                goto finish;
                                        }

                                } else {
                                        assert(out_buffer_full >= (size_t) k);
                                        memmove(out_buffer, out_buffer + k, out_buffer_full - k);
                                        out_buffer_full -= k;
                                }
                        }
                }
        }

finish:
        if (ep >= 0)
                close_nointr_nofail(ep);

        if (signal_fd >= 0)
                close_nointr_nofail(signal_fd);

        return r;
}
608
609 int main(int argc, char *argv[]) {
610         pid_t pid = 0;
611         int r = EXIT_FAILURE, k;
612         char *oldcg = NULL, *newcg = NULL;
613         int master = -1;
614         const char *console = NULL;
615         struct termios saved_attr, raw_attr;
616         sigset_t mask;
617         bool saved_attr_valid = false;
618         struct winsize ws;
619
620         log_parse_environment();
621         log_open();
622
623         if ((r = parse_argv(argc, argv)) <= 0)
624                 goto finish;
625
626         if (arg_directory) {
627                 char *p;
628
629                 p = path_make_absolute_cwd(arg_directory);
630                 free(arg_directory);
631                 arg_directory = p;
632         } else
633                 arg_directory = get_current_dir_name();
634
635         if (!arg_directory) {
636                 log_error("Failed to determine path");
637                 goto finish;
638         }
639
640         path_kill_slashes(arg_directory);
641
642         if (geteuid() != 0) {
643                 log_error("Need to be root.");
644                 goto finish;
645         }
646
647         if (sd_booted() <= 0) {
648                 log_error("Not running on a systemd system.");
649                 goto finish;
650         }
651
652         if (path_equal(arg_directory, "/")) {
653                 log_error("Spawning container on root directory not supported.");
654                 goto finish;
655         }
656
657         if (is_os_tree(arg_directory) <= 0) {
658                 log_error("Directory %s doesn't look like an OS root directory. Refusing.", arg_directory);
659                 goto finish;
660         }
661
662         if ((k = cg_get_by_pid(SYSTEMD_CGROUP_CONTROLLER, 0, &oldcg)) < 0) {
663                 log_error("Failed to determine current cgroup: %s", strerror(-k));
664                 goto finish;
665         }
666
667         if (asprintf(&newcg, "%s/nspawn-%lu", oldcg, (unsigned long) getpid()) < 0) {
668                 log_error("Failed to allocate cgroup path.");
669                 goto finish;
670         }
671
672         if ((k = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, newcg, 0)) < 0)  {
673                 log_error("Failed to create cgroup: %s", strerror(-k));
674                 goto finish;
675         }
676
677         if ((master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY)) < 0) {
678                 log_error("Failed to acquire pseudo tty: %m");
679                 goto finish;
680         }
681
682         if (!(console = ptsname(master))) {
683                 log_error("Failed to determine tty name: %m");
684                 goto finish;
685         }
686
687         log_info("Spawning namespace container on %s (console is %s).", arg_directory, console);
688
689         if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) >= 0)
690                 ioctl(master, TIOCSWINSZ, &ws);
691
692         if (unlockpt(master) < 0) {
693                 log_error("Failed to unlock tty: %m");
694                 goto finish;
695         }
696
697         if (tcgetattr(STDIN_FILENO, &saved_attr) < 0) {
698                 log_error("Failed to get terminal attributes: %m");
699                 goto finish;
700         }
701
702         saved_attr_valid = true;
703
704         raw_attr = saved_attr;
705         cfmakeraw(&raw_attr);
706         raw_attr.c_lflag &= ~ECHO;
707
708         if (tcsetattr(STDIN_FILENO, TCSANOW, &raw_attr) < 0) {
709                 log_error("Failed to set terminal attributes: %m");
710                 goto finish;
711         }
712
713         assert_se(sigemptyset(&mask) == 0);
714         sigset_add_many(&mask, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1);
715         assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) == 0);
716
717         pid = syscall(__NR_clone, SIGCHLD|CLONE_NEWIPC|CLONE_NEWNS|CLONE_NEWPID|CLONE_NEWUTS|(arg_private_network ? CLONE_NEWNET : 0), NULL);
718         if (pid < 0) {
719                 if (errno == EINVAL)
720                         log_error("clone() failed, do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in): %m");
721                 else
722                         log_error("clone() failed: %m");
723
724                 goto finish;
725         }
726
727         if (pid == 0) {
728                 /* child */
729
730                 const char *hn;
731                 const char *home = NULL;
732                 uid_t uid = (uid_t) -1;
733                 gid_t gid = (gid_t) -1;
734                 const char *envp[] = {
735                         "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
736                         "container=systemd-nspawn", /* LXC sets container=lxc, so follow the scheme here */
737                         NULL, /* TERM */
738                         NULL, /* HOME */
739                         NULL, /* USER */
740                         NULL, /* LOGNAME */
741                         NULL
742                 };
743
744                 envp[2] = strv_find_prefix(environ, "TERM=");
745
746                 close_nointr_nofail(master);
747
748                 close_nointr(STDIN_FILENO);
749                 close_nointr(STDOUT_FILENO);
750                 close_nointr(STDERR_FILENO);
751
752                 close_all_fds(NULL, 0);
753
754                 reset_all_signal_handlers();
755
756                 assert_se(sigemptyset(&mask) == 0);
757                 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
758
759                 if (setsid() < 0)
760                         goto child_fail;
761
762                 if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
763                         goto child_fail;
764
765                 /* Mark / as private, in case somebody marked it shared */
766                 if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) < 0)
767                         goto child_fail;
768
769                 if (mount_all(arg_directory) < 0)
770                         goto child_fail;
771
772                 if (copy_devnodes(arg_directory, console) < 0)
773                         goto child_fail;
774
775                 if (chdir(arg_directory) < 0) {
776                         log_error("chdir(%s) failed: %m", arg_directory);
777                         goto child_fail;
778                 }
779
780                 if (open_terminal("dev/console", O_RDWR) != STDIN_FILENO ||
781                     dup2(STDIN_FILENO, STDOUT_FILENO) != STDOUT_FILENO ||
782                     dup2(STDIN_FILENO, STDERR_FILENO) != STDERR_FILENO)
783                         goto child_fail;
784
785                 if (mount(arg_directory, "/", "bind", MS_BIND|MS_MOVE, NULL) < 0) {
786                         log_error("mount(MS_MOVE) failed: %m");
787                         goto child_fail;
788                 }
789
790                 if (chroot(".") < 0) {
791                         log_error("chroot() failed: %m");
792                         goto child_fail;
793                 }
794
795                 if (chdir("/") < 0) {
796                         log_error("chdir() failed: %m");
797                         goto child_fail;
798                 }
799
800                 umask(0022);
801
802                 loopback_setup();
803
804                 if (drop_capabilities() < 0)
805                         goto child_fail;
806
807                 if (arg_user) {
808
809                         if (get_user_creds((const char**)&arg_user, &uid, &gid, &home) < 0) {
810                                 log_error("get_user_creds() failed: %m");
811                                 goto child_fail;
812                         }
813
814                         if (mkdir_parents(home, 0775) < 0) {
815                                 log_error("mkdir_parents() failed: %m");
816                                 goto child_fail;
817                         }
818
819                         if (safe_mkdir(home, 0775, uid, gid) < 0) {
820                                 log_error("safe_mkdir() failed: %m");
821                                 goto child_fail;
822                         }
823
824                         if (initgroups((const char*)arg_user, gid) < 0) {
825                                 log_error("initgroups() failed: %m");
826                                 goto child_fail;
827                         }
828
829                         if (setresgid(gid, gid, gid) < 0) {
830                                 log_error("setregid() failed: %m");
831                                 goto child_fail;
832                         }
833
834                         if (setresuid(uid, uid, uid) < 0) {
835                                 log_error("setreuid() failed: %m");
836                                 goto child_fail;
837                         }
838                 }
839
840                 if ((asprintf((char**)(envp + 3), "HOME=%s", home? home: "/root") < 0) ||
841                     (asprintf((char**)(envp + 4), "USER=%s", arg_user? arg_user : "root") < 0) ||
842                     (asprintf((char**)(envp + 5), "LOGNAME=%s", arg_user? arg_user : "root") < 0)) {
843                     log_error("Out of memory");
844                     goto child_fail;
845                 }
846
847                 if ((hn = file_name_from_path(arg_directory)))
848                         sethostname(hn, strlen(hn));
849
850                 if (argc > optind)
851                         execvpe(argv[optind], argv + optind, (char**) envp);
852                 else {
853                         chdir(home ? home : "/root");
854                         execle("/bin/bash", "-bash", NULL, (char**) envp);
855                 }
856
857                 log_error("execv() failed: %m");
858
859         child_fail:
860                 _exit(EXIT_FAILURE);
861         }
862
863         if (process_pty(master, &mask) < 0)
864                 goto finish;
865
866         if (saved_attr_valid) {
867                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
868                 saved_attr_valid = false;
869         }
870
871         r = wait_for_terminate_and_warn(argc > optind ? argv[optind] : "bash", pid);
872
873         if (r < 0)
874                 r = EXIT_FAILURE;
875
876 finish:
877         if (saved_attr_valid)
878                 tcsetattr(STDIN_FILENO, TCSANOW, &saved_attr);
879
880         if (master >= 0)
881                 close_nointr_nofail(master);
882
883         if (oldcg)
884                 cg_attach(SYSTEMD_CGROUP_CONTROLLER, oldcg, 0);
885
886         if (newcg)
887                 cg_kill_recursive_and_wait(SYSTEMD_CGROUP_CONTROLLER, newcg, true);
888
889         free(arg_directory);
890         free(oldcg);
891         free(newcg);
892
893         return r;
894 }